From 086c1880ac600e8d4b043fc8206298e9e964081d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 15 Oct 2025 12:24:21 +0200 Subject: [PATCH 01/91] binarization: add option `--overwrite`, skip existing outputs (also, simplify `run` and separate `run_single`) --- src/eynollah/cli.py | 16 ++++-- src/eynollah/sbb_binarize.py | 96 +++++++++++++++--------------------- 2 files changed, 52 insertions(+), 60 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index c9bad52..e4a24e4 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -79,18 +79,28 @@ def machine_based_reading_order(input, dir_in, out, model, log_level): type=click.Path(file_okay=True, dir_okay=True), required=True, ) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) @click.option( "--log_level", "-l", type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), help="Override log level globally to this", ) -def binarization(patches, model_dir, input_image, dir_in, output, log_level): +def binarization(patches, model_dir, input_image, dir_in, output, overwrite, log_level): assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." binarizer = SbbBinarizer(model_dir) if log_level: - binarizer.log.setLevel(getLevelName(log_level)) - binarizer.run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in) + binarizer.logger.setLevel(getLevelName(log_level)) + binarizer.run(overwrite=overwrite, + use_patches=patches, + image_path=input_image, + output=output, + dir_in=dir_in) @main.command() diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index 3716987..0eab2ae 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -25,7 +25,7 @@ class SbbBinarizer: def __init__(self, model_dir, logger=None): self.model_dir = model_dir - self.log = logger if logger else logging.getLogger('SbbBinarizer') + self.logger = logger if logger else logging.getLogger('SbbBinarizer') self.start_new_session() @@ -315,64 +315,46 @@ class SbbBinarizer: prediction_true = prediction_true.astype(np.uint8) return prediction_true[:,:,0] - def run(self, image=None, image_path=None, output=None, use_patches=False, dir_in=None): - # print(dir_in,'dir_in') - if not dir_in: - if (image is not None and image_path is not None) or \ - (image is None and image_path is None): - raise ValueError("Must pass either a opencv2 image or an image_path") - if image_path is not None: - image = cv2.imread(image_path) - img_last = 0 - for n, (model, model_file) in enumerate(zip(self.models, self.model_files)): - self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files))) - - res = self.predict(model, image, use_patches) - - img_fin = np.zeros((res.shape[0], res.shape[1], 3)) - res[:, :][res[:, :] == 0] = 2 - res = res - 1 - res = res * 255 - img_fin[:, :, 0] = res - img_fin[:, :, 1] = res - img_fin[:, :, 2] = res - - img_fin = img_fin.astype(np.uint8) - img_fin = (res[:, :] == 0) * 255 - img_last = img_last + img_fin - - kernel = np.ones((5, 5), np.uint8) - img_last[:, :][img_last[:, :] > 0] = 255 - img_last = (img_last[:, :] == 0) * 255 - if output: - cv2.imwrite(output, img_last) - return img_last + def run(self, image_path=None, output=None, dir_in=None, use_patches=False, overwrite=False): + if dir_in: + ls_imgs = [(os.path.join(dir_in, image_filename), + os.path.join(output, os.path.splitext(image_filename)[0] + '.png')) + 
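Stripped of the class plumbing, the skip/overwrite protocol this hunk introduces amounts to the following sketch (the `process` callback is a hypothetical stand-in for reading, binarizing and writing one image):

    import os
    import logging

    logger = logging.getLogger('SbbBinarizer')

    def run_all(ls_imgs, process, overwrite=False):
        # ls_imgs pairs each input path with its target output path
        for input_path, output_path in ls_imgs:
            if os.path.exists(output_path):
                if overwrite:
                    logger.warning("will overwrite existing output file '%s'", output_path)
                else:
                    logger.warning("will skip input for existing output file '%s'", output_path)
                    continue
            process(input_path, output_path)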
+                       for image_filename in filter(is_image_filename,
+                                                    os.listdir(dir_in))]
         else:
-            ls_imgs = list(filter(is_image_filename, os.listdir(dir_in)))
-            for image_name in ls_imgs:
-                image_stem = image_name.split('.')[0]
-                print(image_name,'image_name')
-                image = cv2.imread(os.path.join(dir_in,image_name) )
-                img_last = 0
-                for n, (model, model_file) in enumerate(zip(self.models, self.model_files)):
-                    self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files)))
+            ls_imgs = [(image_path, output)]

-                    res = self.predict(model, image, use_patches)
+        for input_path, output_path in ls_imgs:
+            print(input_path, 'image_name')
+            if os.path.exists(output_path):
+                if overwrite:
+                    self.logger.warning("will overwrite existing output file '%s'", output_path)
+                else:
+                    self.logger.warning("will skip input for existing output file '%s'", output_path)
+                    continue
+            image = cv2.imread(input_path)
+            result = self.run_single(image, use_patches)
+            cv2.imwrite(output_path, result)

-                    img_fin = np.zeros((res.shape[0], res.shape[1], 3))
-                    res[:, :][res[:, :] == 0] = 2
-                    res = res - 1
-                    res = res * 255
-                    img_fin[:, :, 0] = res
-                    img_fin[:, :, 1] = res
-                    img_fin[:, :, 2] = res
+    def run_single(self, image: np.ndarray, use_patches=False):
+        img_last = 0
+        for n, (model, model_file) in enumerate(zip(self.models, self.model_files)):
+            self.logger.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files)))

-                    img_fin = img_fin.astype(np.uint8)
-                    img_fin = (res[:, :] == 0) * 255
-                    img_last = img_last + img_fin
+            res = self.predict(model, image, use_patches)

-                kernel = np.ones((5, 5), np.uint8)
-                img_last[:, :][img_last[:, :] > 0] = 255
-                img_last = (img_last[:, :] == 0) * 255
-
-                cv2.imwrite(os.path.join(output, image_stem + '.png'), img_last)
+            img_fin = np.zeros((res.shape[0], res.shape[1], 3))
+            res[:, :][res[:, :] == 0] = 2
+            res = res - 1
+            res = res * 255
+            img_fin[:, :, 0] = res
+            img_fin[:, :, 1] = res
+            img_fin[:, :, 2] = res
+
+            img_fin = img_fin.astype(np.uint8)
+            img_fin = (res[:, :] == 0) * 255
+            img_last = img_last + img_fin
+
+        kernel = np.ones((5, 5), np.uint8)
+        img_last[:, :][img_last[:, :] > 0] = 255
+        img_last = (img_last[:, :] == 0) * 255
+        return img_last

From 184927fb5488f440948320ca97d716144da5012c Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Mon, 20 Oct 2025 13:16:57 +0200
Subject: [PATCH 02/91] `find_num_col`: re-sort peaks when cutting n-best
 `num_col_classifier`

---
 src/eynollah/utils/__init__.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index 5ccb2af..7c47407 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -463,22 +463,19 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
     interest_neg_fin = interest_neg[(interest_neg < grenze)]
     peaks_neg_fin = peaks_neg[(interest_neg < grenze)]

-    # interest_neg_fin=interest_neg[(interest_neg<grenze)]
     if num_col_classifier - ((len(interest_neg_fin)) + 1) >= 3:
-        index_sort_interest_neg_fin= np.argsort(interest_neg_fin)
-        peaks_neg_sorted = np.array(peaks_neg)[index_sort_interest_neg_fin]
-        interest_neg_fin_sorted = np.array(interest_neg_fin)[index_sort_interest_neg_fin]
+        # found too few columns here: ignore 'grenze' and take the deepest N peaks
+        sort_by_height = np.argsort(interest_neg)[:num_col_classifier]
+        peaks_neg_fin = peaks_neg[sort_by_height]
+        interest_neg_fin = interest_neg[sort_by_height]
+        # print(peaks_neg_fin, "peaks_neg[sorted_by_height]")
+        sort_by_pos = np.argsort(peaks_neg_fin)
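The second argsort is the point of this patch: np.argsort over the valley depths yields the N deepest gaps in depth order, but column boundaries must be emitted in left-to-right position order. A tiny self-contained demo with made-up numbers:

    import numpy as np

    interest_neg = np.array([5., 1., 9., 2.])   # valley depths (smaller = deeper)
    peaks_neg = np.array([100, 400, 700, 900])  # valley x-positions
    sort_by_height = np.argsort(interest_neg)[:3]
    cut = peaks_neg[sort_by_height]
    print(cut)                   # [400 900 100] - deepest first, out of order
    print(cut[np.argsort(cut)])  # [100 400 900] - re-sorted by position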
peaks_neg_fin = peaks_neg_fin[sort_by_pos] + interest_neg_fin = interest_neg_fin[sort_by_pos] - if len(index_sort_interest_neg_fin)>=num_col_classifier: - peaks_neg_fin = list( peaks_neg_sorted[:num_col_classifier] ) - interest_neg_fin = list( interest_neg_fin_sorted[:num_col_classifier] ) - else: - peaks_neg_fin = peaks_neg[:] - interest_neg_fin = interest_neg[:] - - num_col = (len(interest_neg_fin)) + 1 + num_col = len(interest_neg_fin) + 1 # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') From 48761c3e127bfde488cc3ff6dd7edc97eb85bfd0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:20:12 +0200 Subject: [PATCH 03/91] `find_num_col`: simplify, add better plotting (but commented out) --- src/eynollah/utils/__init__.py | 208 +++++++++++++++++---------------- 1 file changed, 108 insertions(+), 100 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 7c47407..ce72df4 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -396,16 +396,18 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8): def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8): if not regions_without_separators.any(): return 0, [] - #plt.imshow(regions_without_separators) - #plt.show() regions_without_separators_0 = regions_without_separators.sum(axis=0) - ##plt.plot(regions_without_separators_0) - ##plt.show() + # fig, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(regions_without_separators_0) + # plt.show() sigma_ = 35 # 70#35 - meda_n_updown = regions_without_separators_0[len(regions_without_separators_0) :: -1] + meda_n_updown = regions_without_separators_0[::-1] first_nonzero = next((i for i, x in enumerate(regions_without_separators_0) if x), 0) last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) last_nonzero = len(regions_without_separators_0) - last_nonzero + last_nonzero = last_nonzero - 100 + first_nonzero = first_nonzero + 200 y = regions_without_separators_0 # [first_nonzero:last_nonzero] y_help = np.zeros(len(y) + 20) y_help[10 : len(y) + 10] = y @@ -416,28 +418,44 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl z = gaussian_filter1d(y, sigma_) zneg = gaussian_filter1d(zneg, sigma_) - peaks_neg, _ = find_peaks(zneg, height=0) - #plt.plot(zneg) - #plt.plot(peaks_neg, zneg[peaks_neg], 'rx') - #plt.show() peaks, _ = find_peaks(z, height=0) + peaks_neg, _ = find_peaks(zneg, height=0) + # _, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.set_title("z") + # ax1.plot(z) + # ax1.scatter(peaks, z[peaks]) + # ax1.axvline(0.06 * len(y), label="first") + # ax1.axvline(0.94 * len(y), label="last") + # ax1.text(0.06 * len(y), 0, "first", rotation=90) + # ax1.text(0.94 * len(y), 0, "last", rotation=90) + # ax1.axhline(10, label="minimum") + # ax1.text(0, 10, "minimum") + # ax2.set_title("zneg") + # ax2.plot(zneg) + # ax2.scatter(peaks_neg, zneg[peaks_neg]) + # ax2.axvline(first_nonzero, label="first nonzero") + # ax2.axvline(last_nonzero, label="last nonzero") + # ax2.text(first_nonzero, 0, "first nonzero", rotation=90) + # ax2.text(last_nonzero, 0, "last nonzero", rotation=90) + # ax2.axvline(370, label="first") + # ax2.axvline(len(y) - 370, label="last") + # ax2.text(370, 0, "first", rotation=90) + # ax2.text(len(y) - 370, 0, "last", rotation=90) + # plt.show() peaks_neg = peaks_neg - 10 - 10 - last_nonzero = last_nonzero - 100 - 
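The surrounding hunk is an instance of the usual projection-profile recipe for column detection: sum the binarized page column-wise, invert and smooth, then take peaks of the result as candidate gutters. A minimal sketch of that recipe, assuming `regions` is a binary 2D array:

    import numpy as np
    from scipy.ndimage import gaussian_filter1d
    from scipy.signal import find_peaks

    def column_gap_candidates(regions, sigma=35):
        profile = regions.sum(axis=0)       # vertical projection profile
        inverted = profile.max() - profile  # low-ink gutters become peaks
        smoothed = gaussian_filter1d(inverted, sigma)
        peaks, _ = find_peaks(smoothed, height=0)
        return peaks                        # x-positions of candidate gutters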
first_nonzero = first_nonzero + 200 - - peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & - (peaks_neg < last_nonzero)] - peaks = peaks[(peaks > 0.06 * regions_without_separators.shape[1]) & - (peaks < 0.94 * regions_without_separators.shape[1])] - peaks_neg = peaks_neg[(peaks_neg > 370) & - (peaks_neg < (regions_without_separators.shape[1] - 370))] + peaks = peaks[(peaks > 0.06 * len(y)) & + (peaks < 0.94 * len(y))] interest_pos = z[peaks] interest_pos = interest_pos[interest_pos > 10] if not interest_pos.any(): return 0, [] # plt.plot(z) # plt.show() + peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & + (peaks_neg < last_nonzero)] + peaks_neg = peaks_neg[(peaks_neg > 370) & + (peaks_neg < len(y) - 370)] interest_neg = z[peaks_neg] if not interest_neg.any(): return 0, [] @@ -445,21 +463,28 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl min_peaks_pos = np.min(interest_pos) max_peaks_pos = np.max(interest_pos) - if max_peaks_pos / min_peaks_pos >= 35: + #print(min_peaks_pos, max_peaks_pos, max_peaks_pos / min_peaks_pos, 'minmax') + if max_peaks_pos / (min_peaks_pos or 1e-9) >= 35: min_peaks_pos = np.mean(interest_pos) min_peaks_neg = 0 # np.min(interest_neg) - # print(np.min(interest_pos),np.max(interest_pos),np.max(interest_pos)/np.min(interest_pos),'minmax') dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier grenze = min_peaks_pos - dis_talaei - # np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 + #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0 # print(interest_neg,'interest_neg') # print(grenze,'grenze') # print(min_peaks_pos,'min_peaks_pos') # print(dis_talaei,'dis_talaei') # print(peaks_neg,'peaks_neg') + # fig, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(z) + # ax2.scatter(peaks_neg, z[peaks_neg]) + # ax2.axhline(grenze, label="grenze") + # ax2.text(0, grenze, "grenze") + # plt.show() interest_neg_fin = interest_neg[(interest_neg < grenze)] peaks_neg_fin = peaks_neg[(interest_neg < grenze)] @@ -479,46 +504,38 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_g_l = int(len(y) / 4.0) - p_g_u = len(y) - int(len(y) / 4.0) - - if num_col == 3: - if ((peaks_neg_fin[0] > p_g_u and - peaks_neg_fin[1] > p_g_u) or - (peaks_neg_fin[0] < p_g_l and - peaks_neg_fin[1] < p_g_l) or - (peaks_neg_fin[0] + 200 < p_m and - peaks_neg_fin[1] < p_m) or - (peaks_neg_fin[0] - 200 > p_m and - peaks_neg_fin[1] > p_m)): - num_col = 1 - peaks_neg_fin = [] - - if num_col == 2: - if (peaks_neg_fin[0] > p_g_u or - peaks_neg_fin[0] < p_g_l): - num_col = 1 - peaks_neg_fin = [] + # cancel if resulting split is highly unbalanced across available width + if ((num_col == 3 and + ((peaks_neg_fin[0] > 0.75 * len(y) and + peaks_neg_fin[1] > 0.75 * len(y)) or + (peaks_neg_fin[0] < 0.25 * len(y) and + peaks_neg_fin[1] < 0.25 * len(y)) or + (peaks_neg_fin[0] < 0.5 * len(y) - 200 and + peaks_neg_fin[1] < 0.5 * len(y)) or + (peaks_neg_fin[0] > 0.5 * len(y) + 200 and + peaks_neg_fin[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_fin[0] > 0.75 * len(y) or + peaks_neg_fin[0] < 0.25 * len(y)))): + num_col = 1 + peaks_neg_fin = [] ##print(len(peaks_neg_fin)) + # filter out peaks that are too close (<400px) to each other: + # among each group, pick the position with 
smallest amount of text diff_peaks = np.abs(np.diff(peaks_neg_fin)) cut_off = 400 peaks_neg_true = [] forest = [] - # print(len(peaks_neg_fin),'len_') - for i in range(len(peaks_neg_fin)): if i == 0: forest.append(peaks_neg_fin[i]) if i < len(peaks_neg_fin) - 1: if diff_peaks[i] <= cut_off: forest.append(peaks_neg_fin[i + 1]) - if diff_peaks[i] > cut_off: + else: # print(forest[np.argmin(z[forest]) ] ) if not isNaN(forest[np.argmin(z[forest])]): peaks_neg_true.append(forest[np.argmin(z[forest])]) @@ -530,68 +547,59 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl peaks_neg_true.append(forest[np.argmin(z[forest])]) num_col = len(peaks_neg_true) + 1 - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_quarter = int(len(y) / 5.0) - p_g_l = int(len(y) / 4.0) - p_g_u = len(y) - int(len(y) / 4.0) - - p_u_quarter = len(y) - p_quarter - + #print(peaks_neg_true, "peaks_neg_true") ##print(num_col,'early') - if num_col == 3: - if ((peaks_neg_true[0] > p_g_u and - peaks_neg_true[1] > p_g_u) or - (peaks_neg_true[0] < p_g_l and - peaks_neg_true[1] < p_g_l) or - (peaks_neg_true[0] < p_m and - peaks_neg_true[1] + 200 < p_m) or - (peaks_neg_true[0] - 200 > p_m and - peaks_neg_true[1] > p_m)): - num_col = 1 - peaks_neg_true = [] - elif (peaks_neg_true[0] < p_g_u and - peaks_neg_true[0] > p_g_l and - peaks_neg_true[1] > p_u_quarter): - peaks_neg_true = [peaks_neg_true[0]] - elif (peaks_neg_true[1] < p_g_u and - peaks_neg_true[1] > p_g_l and - peaks_neg_true[0] < p_quarter): - peaks_neg_true = [peaks_neg_true[1]] + # cancel if resulting split is highly unbalanced across available width + if ((num_col == 3 and + ((peaks_neg_true[0] > 0.75 * len(y) and + peaks_neg_true[1] > 0.75 * len(y)) or + (peaks_neg_true[0] < 0.25 * len(y) and + peaks_neg_true[1] < 0.25 * len(y)) or + (peaks_neg_true[0] < 0.5 * len(y) - 200 and + peaks_neg_true[1] < 0.5 * len(y)) or + (peaks_neg_true[0] > 0.5 * len(y) + 200 and + peaks_neg_true[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_true[0] > 0.75 * len(y) or + peaks_neg_true[0] < 0.25 * len(y)))): + num_col = 1 + peaks_neg_true = [] + if (num_col == 3 and + (peaks_neg_true[0] < 0.75 * len(y) and + peaks_neg_true[0] > 0.25 * len(y) and + peaks_neg_true[1] > 0.80 * len(y))): + num_col = 2 + peaks_neg_true = [peaks_neg_true[0]] + if (num_col == 3 and + (peaks_neg_true[1] < 0.75 * len(y) and + peaks_neg_true[1] > 0.25 * len(y) and + peaks_neg_true[0] < 0.20 * len(y))): + num_col = 2 + peaks_neg_true = [peaks_neg_true[1]] - if num_col == 2: - if (peaks_neg_true[0] > p_g_u or - peaks_neg_true[0] < p_g_l): - num_col = 1 - peaks_neg_true = [] + # get rid of too narrow columns (not used) + # if np.count_nonzero(diff_peaks < 360): + # arg_help = np.arange(len(diff_peaks)) + # arg_help_ann = arg_help[diff_peaks < 360] + # peaks_neg_fin_new = [] + # for ii in range(len(peaks_neg_fin)): + # if ii in arg_help_ann: + # if interest_neg_fin[ii] < interest_neg_fin[ii + 1]: + # peaks_neg_fin_new.append(peaks_neg_fin[ii]) + # else: + # peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - diff_peaks_abnormal = diff_peaks[diff_peaks < 360] - - if len(diff_peaks_abnormal) > 0: - arg_help = np.arange(len(diff_peaks)) - arg_help_ann = arg_help[diff_peaks < 360] - - peaks_neg_fin_new = [] - - for ii in range(len(peaks_neg_fin)): - if ii in arg_help_ann: - arg_min = np.argmin([interest_neg_fin[ii], interest_neg_fin[ii + 1]]) - if arg_min == 0: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - - elif (ii - 
1) not in arg_help_ann: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new = peaks_neg_fin + # elif (ii - 1) not in arg_help_ann: + # peaks_neg_fin_new.append(peaks_neg_fin[ii]) + # else: + # peaks_neg_fin_new = peaks_neg_fin # plt.plot(gaussian_filter1d(y, sigma_)) # plt.plot(peaks_neg_true,z[peaks_neg_true],'*') # plt.plot([0,len(y)], [grenze,grenze]) # plt.show() ##print(len(peaks_neg_true)) + #print(peaks_neg_true, "peaks_neg_true") return len(peaks_neg_true), peaks_neg_true def find_num_col_only_image(regions_without_separators, multiplier=3.8): From c43a825d1d26c36beee3bbc2e038f8c0cda4221b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:26:01 +0200 Subject: [PATCH 04/91] `order_of_regions`: filter out-of-image peaks --- src/eynollah/utils/__init__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index ce72df4..677ed53 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1216,15 +1216,16 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): peaks_neg, _ = find_peaks(zneg, height=0) peaks_neg = peaks_neg - 20 - 20 - ##plt.plot(z) - ##plt.show() - cx_main, cy_main = find_center_of_contours(contours_main) - cx_head, cy_head = find_center_of_contours(contours_head) - - peaks_neg_new = np.append(np.insert(peaks_neg, 0, 0), textline_mask.shape[0]) + peaks_neg_new = np.array([0] + + # peaks can be beyond box due to padding and smoothing + [peak for peak in peaks_neg + if 0 < peak and peak < textline_mask.shape[0]] + + [textline_mask.shape[0]]) # offset from bbox of mask peaks_neg_new += y_ref + cx_main, cy_main = find_center_of_contours(contours_main) + cx_head, cy_head = find_center_of_contours(contours_head) # assert not len(cy_main) or np.min(peaks_neg_new) <= np.min(cy_main) and np.max(cy_main) <= np.max(peaks_neg_new) # assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new) From d3d599b0108bf17802bda2f9808620e3cd8471db Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:27:23 +0200 Subject: [PATCH 05/91] `order_of_regions`: add better plotting (but commented out) --- src/eynollah/eynollah.py | 2 +- src/eynollah/utils/__init__.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 13acba6..9412861 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2553,7 +2553,7 @@ class Eynollah: con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 677ed53..f2e3581 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1197,7 +1197,7 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed -def order_of_regions(textline_mask, contours_main, contours_head, y_ref): +def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): ##plt.imshow(textline_mask) 
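Reduced to its core, `order_of_regions` as patched here cuts the box into horizontal bands at projection valleys (discarding peaks that padding and smoothing pushed outside the mask, then offsetting by the box origin `y_ref`) and reads regions band by band, left to right. A minimal sketch, assuming numpy arrays `cx`, `cy` of region centers:

    import numpy as np

    def reading_order(peaks_neg, height, y_ref, cx, cy):
        inside = [p for p in peaks_neg if 0 < p < height]  # drop out-of-box peaks
        bounds = np.array([0] + inside + [height]) + y_ref
        order = []
        for top, bot in zip(bounds[:-1], bounds[1:]):
            in_band = np.flatnonzero((cy >= top) & (cy < bot))
            order.extend(in_band[np.argsort(cx[in_band])])
        return order  # region indices in reading order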
##plt.show() y = textline_mask.sum(axis=1) # horizontal projection profile @@ -1208,6 +1208,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): #z = gaussian_filter1d(y_padded, sigma_gaus) #peaks, _ = find_peaks(z, height=0) #peaks = peaks - 20 + ##plt.plot(z) + ##plt.show() zneg_rev = np.max(y_padded) - y_padded zneg = np.zeros(len(zneg_rev) + 40) zneg[20 : len(zneg_rev) + 20] = zneg_rev @@ -1250,6 +1252,22 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): indexes_in, types_in, cxs_in, cys_in, typed_indexes_in = \ matrix_of_orders[(matrix_of_orders[:, 3] >= top) & (matrix_of_orders[:, 3] < bot)].T + # if indexes_in.size: + # img = textline_mask.copy() + # plt.imshow(img) + # plt.gca().add_patch(patches.Rectangle((0, top-y_ref), img.shape[1], bot-top, alpha=0.5, color='gray')) + # xrange = np.arange(0, img.shape[1], 50) + # yrange = np.arange(0, img.shape[0], 50) + # plt.gca().set_xticks(xrange, xrange + x_ref) + # plt.gca().set_yticks(yrange, yrange + y_ref) + # for idx, type_, cx, cy in zip(typed_indexes_in, types_in, cxs_in, cys_in): + # cnt = (contours_main if type_ == 1 else contours_head)[idx] + # col = 'red' if type_ == 1 else 'blue' + # plt.scatter(cx - x_ref, cy - y_ref, 20, c=col, marker='o') + # plt.gca().add_patch(patches.Polygon(cnt[:, 0] - [[x_ref, y_ref]], closed=False, fill=False, color=col)) + # plt.title("box contours centered in %d:%d (red=main / blue=heading)" % (top, bot)) + # plt.show() + sorted_inside = np.argsort(cxs_in) final_indexers_sorted.extend(indexes_in[sorted_inside]) final_types.extend(types_in[sorted_inside]) From 542d38ab432e3089ebc8fefd3caee2915fe6b031 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:34:56 +0200 Subject: [PATCH 06/91] =?UTF-8?q?`find=5Fnumber=5Fof=5Fcolumns=5Fin=5Fdocu?= =?UTF-8?q?ment`:=20simplify,=20rename=20`line`=E2=86=92`seps`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/utils/__init__.py | 244 +++++++++++++++------------------ 1 file changed, 109 insertions(+), 135 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f2e3581..168899f 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1377,175 +1377,149 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot.append(last_point) return peaks_neg_tot -def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_lines, contours_h=None): - t_ins_c0 = time.time() - separators_closeup=( (region_pre_p[:,:]==label_lines))*1 - separators_closeup[0:110,:]=0 - separators_closeup[separators_closeup.shape[0]-150:,:]=0 +def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): + separators_closeup = 1 * (region_pre_p == label_seps) + separators_closeup[0:110] = 0 + separators_closeup[-150:] = 0 kernel = np.ones((5,5),np.uint8) - separators_closeup=separators_closeup.astype(np.uint8) - separators_closeup = cv2.dilate(separators_closeup,kernel,iterations = 1) - separators_closeup = cv2.erode(separators_closeup,kernel,iterations = 1) + separators_closeup = separators_closeup.astype(np.uint8) + separators_closeup = cv2.morphologyEx(separators_closeup, cv2.MORPH_CLOSE, kernel, iterations=1) - separators_closeup_new=np.zeros((separators_closeup.shape[0] ,separators_closeup.shape[1] )) - separators_closeup_n=np.copy(separators_closeup) - 
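The separator cleanup in this function uses the standard OpenCV morphology idiom that the hunk below spells out: opening with a wide, 1-pixel-high kernel keeps only horizontal runs, and opening with a tall, 1-pixel-wide kernel keeps only vertical runs. In isolation, assuming `mask` is a binary uint8 image:

    import cv2

    def split_lines(mask):
        hsize = max(1, mask.shape[1] // 30)
        vsize = max(1, mask.shape[0] // 30)
        hkernel = cv2.getStructuringElement(cv2.MORPH_RECT, (hsize, 1))
        vkernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vsize))
        horizontal = cv2.morphologyEx(mask, cv2.MORPH_OPEN, hkernel)
        vertical = cv2.morphologyEx(mask, cv2.MORPH_OPEN, vkernel)
        return horizontal, vertical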
separators_closeup_n=separators_closeup_n.astype(np.uint8) + separators_closeup_n = separators_closeup.astype(np.uint8) # to be returned - separators_closeup_n_binary=np.zeros(( separators_closeup_n.shape[0],separators_closeup_n.shape[1]) ) - separators_closeup_n_binary[:,:]=separators_closeup_n[:,:] - separators_closeup_n_binary[:,:][separators_closeup_n_binary[:,:]!=0]=1 + separators_closeup_n_binary = separators_closeup_n.copy() - _, thresh_e = cv2.threshold(separators_closeup_n_binary, 0, 255, 0) - contours_line_e, _ = cv2.findContours(thresh_e.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - _, dist_xe, _, _, _, _, y_min_main, y_max_main, _ = \ - find_features_of_lines(contours_line_e) - dist_ye = y_max_main - y_min_main - args_e=np.arange(len(contours_line_e)) - args_hor_e=args_e[(dist_ye<=50) & - (dist_xe>=3*dist_ye)] - cnts_hor_e=[] - for ce in args_hor_e: - cnts_hor_e.append(contours_line_e[ce]) + # find horizontal lines by contour properties + contours_sep_e, _ = cv2.findContours(separators_closeup_n_binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cnts_hor_e = [] + for cnt in contours_sep_e: + max_xe = cnt[:, 0, 0].max() + min_xe = cnt[:, 0, 0].min() + max_ye = cnt[:, 0, 1].max() + min_ye = cnt[:, 0, 1].min() + dist_xe = max_xe - min_xe + dist_ye = max_ye - min_ye + if dist_ye <= 50 and dist_xe >= 3 * dist_ye: + cnts_hor_e.append(cnt) - separators_closeup_n_binary=cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) - gray = cv2.bitwise_not(separators_closeup_n_binary) - gray=gray.astype(np.uint8) + # delete horizontal contours (leaving only the edges) + separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) + edges = cv2.adaptiveThreshold(separators_closeup_n_binary * 255, 255, + cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2) + horizontal = np.copy(edges) + vertical = np.copy(edges) - bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, \ - cv2.THRESH_BINARY, 15, -2) - horizontal = np.copy(bw) - vertical = np.copy(bw) - - cols = horizontal.shape[1] - horizontal_size = cols // 30 - # Create structure element for extracting horizontal lines through morphology operations + horizontal_size = horizontal.shape[1] // 30 + # find horizontal lines by morphology horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1)) - # Apply morphology operations - horizontal = cv2.erode(horizontal, horizontalStructure) - horizontal = cv2.dilate(horizontal, horizontalStructure) - - kernel = np.ones((5,5),np.uint8) - horizontal = cv2.dilate(horizontal,kernel,iterations = 2) - horizontal = cv2.erode(horizontal,kernel,iterations = 2) + horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_OPEN, horizontalStructure) + horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_CLOSE, kernel, iterations=2) + # re-insert deleted horizontal contours horizontal = cv2.fillPoly(horizontal, pts=cnts_hor_e, color=255) - rows = vertical.shape[0] - verticalsize = rows // 30 - # Create structure element for extracting vertical lines through morphology operations - verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) - # Apply morphology operations - vertical = cv2.erode(vertical, verticalStructure) - vertical = cv2.dilate(vertical, verticalStructure) - vertical = cv2.dilate(vertical,kernel,iterations = 1) + vertical_size = vertical.shape[0] // 30 + # find vertical lines by morphology + verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size)) + vertical = 
cv2.morphologyEx(vertical, cv2.MORPH_OPEN, verticalStructure) + vertical = cv2.dilate(vertical, kernel, iterations=1) horizontal, special_separators = \ combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( vertical, horizontal, num_col_classifier) - separators_closeup_new[:,:][vertical[:,:]!=0]=1 - separators_closeup_new[:,:][horizontal[:,:]!=0]=1 - _, thresh = cv2.threshold(vertical, 0, 255, 0) - contours_line_vers, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ - find_features_of_lines(contours_line_vers) + contours_sep_vers, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ + find_features_of_lines(contours_sep_vers) - args=np.arange(len(slope_lines)) - args_ver=args[slope_lines==1] - dist_x_ver=dist_x[slope_lines==1] - y_min_main_ver=y_min_main[slope_lines==1] - y_max_main_ver=y_max_main[slope_lines==1] - x_min_main_ver=x_min_main[slope_lines==1] - x_max_main_ver=x_max_main[slope_lines==1] - cx_main_ver=cx_main[slope_lines==1] - dist_y_ver=y_max_main_ver-y_min_main_ver + args=np.arange(len(slope_seps)) + args_ver=args[slope_seps==1] + dist_x_ver=dist_x[slope_seps==1] + y_min_seps_ver=y_min_seps[slope_seps==1] + y_max_seps_ver=y_max_seps[slope_seps==1] + x_min_seps_ver=x_min_seps[slope_seps==1] + x_max_seps_ver=x_max_seps[slope_seps==1] + cx_seps_ver=cx_seps[slope_seps==1] + dist_y_ver=y_max_seps_ver-y_min_seps_ver len_y=separators_closeup.shape[0]/3.0 _, thresh = cv2.threshold(horizontal, 0, 255, 0) - contours_line_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ - find_features_of_lines(contours_line_hors) + contours_sep_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ + find_features_of_lines(contours_sep_hors) - slope_lines_org_hor=slope_lines_org[slope_lines==0] - args=np.arange(len(slope_lines)) + slope_seps_org_hor=slope_seps_org[slope_seps==0] + args=np.arange(len(slope_seps)) len_x=separators_closeup.shape[1]/5.0 - dist_y=np.abs(y_max_main-y_min_main) + dist_y=np.abs(y_max_seps-y_min_seps) - args_hor=args[slope_lines==0] - dist_x_hor=dist_x[slope_lines==0] - y_min_main_hor=y_min_main[slope_lines==0] - y_max_main_hor=y_max_main[slope_lines==0] - x_min_main_hor=x_min_main[slope_lines==0] - x_max_main_hor=x_max_main[slope_lines==0] - dist_y_hor=dist_y[slope_lines==0] - cy_main_hor=cy_main[slope_lines==0] + args_hor=args[slope_seps==0] + dist_x_hor=dist_x[slope_seps==0] + y_min_seps_hor=y_min_seps[slope_seps==0] + y_max_seps_hor=y_max_seps[slope_seps==0] + x_min_seps_hor=x_min_seps[slope_seps==0] + x_max_seps_hor=x_max_seps[slope_seps==0] + dist_y_hor=dist_y[slope_seps==0] + cy_seps_hor=cy_seps[slope_seps==0] args_hor=args_hor[dist_x_hor>=len_x/2.0] - x_max_main_hor=x_max_main_hor[dist_x_hor>=len_x/2.0] - x_min_main_hor=x_min_main_hor[dist_x_hor>=len_x/2.0] - cy_main_hor=cy_main_hor[dist_x_hor>=len_x/2.0] - y_min_main_hor=y_min_main_hor[dist_x_hor>=len_x/2.0] - y_max_main_hor=y_max_main_hor[dist_x_hor>=len_x/2.0] + x_max_seps_hor=x_max_seps_hor[dist_x_hor>=len_x/2.0] + 
x_min_seps_hor=x_min_seps_hor[dist_x_hor>=len_x/2.0] + cy_seps_hor=cy_seps_hor[dist_x_hor>=len_x/2.0] + y_min_seps_hor=y_min_seps_hor[dist_x_hor>=len_x/2.0] + y_max_seps_hor=y_max_seps_hor[dist_x_hor>=len_x/2.0] dist_y_hor=dist_y_hor[dist_x_hor>=len_x/2.0] - slope_lines_org_hor=slope_lines_org_hor[dist_x_hor>=len_x/2.0] + slope_seps_org_hor=slope_seps_org_hor[dist_x_hor>=len_x/2.0] dist_x_hor=dist_x_hor[dist_x_hor>=len_x/2.0] - matrix_of_lines_ch=np.zeros((len(cy_main_hor)+len(cx_main_ver),10)) - matrix_of_lines_ch[:len(cy_main_hor),0]=args_hor - matrix_of_lines_ch[len(cy_main_hor):,0]=args_ver - matrix_of_lines_ch[len(cy_main_hor):,1]=cx_main_ver - matrix_of_lines_ch[:len(cy_main_hor),2]=x_min_main_hor+50#x_min_main_hor+150 - matrix_of_lines_ch[len(cy_main_hor):,2]=x_min_main_ver - matrix_of_lines_ch[:len(cy_main_hor),3]=x_max_main_hor-50#x_max_main_hor-150 - matrix_of_lines_ch[len(cy_main_hor):,3]=x_max_main_ver - matrix_of_lines_ch[:len(cy_main_hor),4]=dist_x_hor - matrix_of_lines_ch[len(cy_main_hor):,4]=dist_x_ver - matrix_of_lines_ch[:len(cy_main_hor),5]=cy_main_hor - matrix_of_lines_ch[:len(cy_main_hor),6]=y_min_main_hor - matrix_of_lines_ch[len(cy_main_hor):,6]=y_min_main_ver - matrix_of_lines_ch[:len(cy_main_hor),7]=y_max_main_hor - matrix_of_lines_ch[len(cy_main_hor):,7]=y_max_main_ver - matrix_of_lines_ch[:len(cy_main_hor),8]=dist_y_hor - matrix_of_lines_ch[len(cy_main_hor):,8]=dist_y_ver - matrix_of_lines_ch[len(cy_main_hor):,9]=1 + matrix_of_seps_ch=np.zeros((len(cy_seps_hor)+len(cx_seps_ver),10)) + matrix_of_seps_ch[:len(cy_seps_hor),0]=args_hor + matrix_of_seps_ch[len(cy_seps_hor):,0]=args_ver + matrix_of_seps_ch[len(cy_seps_hor):,1]=cx_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),2]=x_min_seps_hor+50#x_min_seps_hor+150 + matrix_of_seps_ch[len(cy_seps_hor):,2]=x_min_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),3]=x_max_seps_hor-50#x_max_seps_hor-150 + matrix_of_seps_ch[len(cy_seps_hor):,3]=x_max_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),4]=dist_x_hor + matrix_of_seps_ch[len(cy_seps_hor):,4]=dist_x_ver + matrix_of_seps_ch[:len(cy_seps_hor),5]=cy_seps_hor + matrix_of_seps_ch[:len(cy_seps_hor),6]=y_min_seps_hor + matrix_of_seps_ch[len(cy_seps_hor):,6]=y_min_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),7]=y_max_seps_hor + matrix_of_seps_ch[len(cy_seps_hor):,7]=y_max_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),8]=dist_y_hor + matrix_of_seps_ch[len(cy_seps_hor):,8]=dist_y_ver + matrix_of_seps_ch[len(cy_seps_hor):,9]=1 if contours_h is not None: - _, dist_x_head, x_min_main_head, x_max_main_head, cy_main_head, _, y_min_main_head, y_max_main_head, _ = \ + _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) - matrix_l_n=np.zeros((matrix_of_lines_ch.shape[0]+len(cy_main_head),matrix_of_lines_ch.shape[1])) - matrix_l_n[:matrix_of_lines_ch.shape[0],:]=np.copy(matrix_of_lines_ch[:,:]) - args_head=np.arange(len(cy_main_head)) + len(cy_main_hor) + matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + args_head = np.arange(len(cy_head)) + matrix_l_n[:, 0] = args_head + matrix_l_n[:, 2] = x_min_head+30 + matrix_l_n[:, 3] = x_max_head-30 + matrix_l_n[:, 4] = dist_x_head + matrix_l_n[:, 5] = y_min_head-3-8 + matrix_l_n[:, 6] = y_min_head-5-8 + matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + matrix_l_n[:, 8] = 4 + matrix_of_seps_ch = np.append( + matrix_of_seps_ch, matrix_l_n, axis=0) - matrix_l_n[matrix_of_lines_ch.shape[0]:,0]=args_head - matrix_l_n[matrix_of_lines_ch.shape[0]:,2]=x_min_main_head+30 - 
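For reference, each row of the `matrix_of_seps_ch` assembled above packs one separator's features into 10 columns; read off the assignments in this hunk, they are:

    # col 0: original contour index       col 5: center y (horizontal seps only)
    # col 1: center x (vertical only)     col 6: y_min
    # col 2: x_min (+50px if horizontal)  col 7: y_max
    # col 3: x_max (-50px if horizontal)  col 8: y extent (4 for heading rows)
    # col 4: x extent (dist_x)            col 9: 1 if vertical, else 0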
matrix_l_n[matrix_of_lines_ch.shape[0]:,3]=x_max_main_head-30 - matrix_l_n[matrix_of_lines_ch.shape[0]:,4]=dist_x_head - matrix_l_n[matrix_of_lines_ch.shape[0]:,5]=y_min_main_head-3-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,6]=y_min_main_head-5-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,7]=y_max_main_head#y_min_main_head+1-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,8]=4 - matrix_of_lines_ch=np.copy(matrix_l_n) + cy_seps_splitters=cy_seps_hor[(x_min_seps_hor<=.16*region_pre_p.shape[1]) & + (x_max_seps_hor>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, special_separators) - cy_main_splitters=cy_main_hor[(x_min_main_hor<=.16*region_pre_p.shape[1]) & - (x_max_main_hor>=.84*region_pre_p.shape[1])] - cy_main_splitters=np.array( list(cy_main_splitters)+list(special_separators)) if contours_h is not None: - try: - cy_main_splitters_head=cy_main_head[(x_min_main_head<=.16*region_pre_p.shape[1]) & - (x_max_main_head>=.84*region_pre_p.shape[1])] - cy_main_splitters=np.array( list(cy_main_splitters)+list(cy_main_splitters_head)) - except: - pass - args_cy_splitter=np.argsort(cy_main_splitters) - cy_main_splitters_sort=cy_main_splitters[args_cy_splitter] + cy_seps_splitters_head=cy_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) - splitter_y_new=[] - splitter_y_new.append(0) - for i in range(len(cy_main_splitters_sort)): - splitter_y_new.append( cy_main_splitters_sort[i] ) - splitter_y_new.append(region_pre_p.shape[0]) - splitter_y_new_diff=np.diff(splitter_y_new)/float(region_pre_p.shape[0])*100 + cy_seps_splitters = np.sort(cy_seps_splitters) + splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] + splitter_y_new_diff = np.diff(splitter_y_new) / float(region_pre_p.shape[0]) * 100 args_big_parts=np.arange(len(splitter_y_new_diff))[ splitter_y_new_diff>22 ] @@ -1573,7 +1547,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, peaks_neg_fin=peaks_neg_fin[peaks_neg_fin<=(vertical.shape[1]-500)] peaks_neg_fin_fin=peaks_neg_fin[:] - return num_col_fin, peaks_neg_fin_fin,matrix_of_lines_ch,splitter_y_new,separators_closeup_n + return num_col_fin, peaks_neg_fin_fin, matrix_of_seps_ch, splitter_y_new, separators_closeup_n def return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, From 5a0e4c3b0f2e089acff0b4fbf058f1d2e6f90f66 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:36:10 +0200 Subject: [PATCH 07/91] `find_number_of_columns_in_document`: improve splitter rule extend horizontal separators to full img width if they do not overlap any other regions (only as regards to returned `splitter_y` result, but without changing returned separators mask) --- src/eynollah/utils/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 168899f..b930bfd 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1378,6 +1378,8 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): return peaks_neg_tot def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): + ncomps, ccomps = cv2.connectedComponents(region_pre_p.astype(np.uint8)) + separators_closeup = 1 * (region_pre_p == label_seps) separators_closeup[0:110] = 0 separators_closeup[-150:] = 0 @@ -1398,10 +1400,19 @@ def 
find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, min_xe = cnt[:, 0, 0].min() max_ye = cnt[:, 0, 1].max() min_ye = cnt[:, 0, 1].min() + med_ye = int(np.median(cnt[:, 0, 1])) dist_xe = max_xe - min_xe dist_ye = max_ye - min_ye if dist_ye <= 50 and dist_xe >= 3 * dist_ye: cnts_hor_e.append(cnt) + labels = np.setdiff1d(np.unique(ccomps[med_ye]), [0]) + if len(labels) == 1: + # mid line does not intersect with any other region + # so add it as extra splitter line + cnts_hor_e.append(np.array([[[0, med_ye]], + [[ccomps.shape[1], med_ye]], + [[ccomps.shape[1], med_ye + 1]], + [[0, med_ye + 1]]])) # delete horizontal contours (leaving only the edges) separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) From cd35241e816acc7e2083dc31d99f376a8877904b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:41:36 +0200 Subject: [PATCH 08/91] `find_number_of_columns_in_document`: split headings at top+baseline regarding `splitter_y` result, for headings, instead of cutting right through them via center line, add their toplines and baselines as if they were horizontal separators --- src/eynollah/utils/__init__.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index b930bfd..0c3e4ae 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1506,15 +1506,33 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, if contours_h is not None: _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) + # matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + # args_head = np.arange(len(cy_head)) + # matrix_l_n[:, 0] = args_head + # matrix_l_n[:, 2] = x_min_head+30 + # matrix_l_n[:, 3] = x_max_head-30 + # matrix_l_n[:, 4] = dist_x_head + # matrix_l_n[:, 5] = y_min_head-3-8 + # matrix_l_n[:, 6] = y_min_head-5-8 + # matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + # matrix_l_n[:, 8] = 4 + # split at toplines (y_min_head) and baselines (y_max_head) instead of center (cy_head): + cy_head = np.stack((y_min_head, y_max_head)).T.flatten() + y_min_head, y_max_head = (np.stack((y_min_head - 2, y_max_head - 2)).T.flatten(), + np.stack((y_min_head + 2, y_max_head + 2)).T.flatten()) + x_min_head = np.repeat(x_min_head, 2) + x_max_head = np.repeat(x_max_head, 2) + dist_x_head = np.repeat(dist_x_head, 2) matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) args_head = np.arange(len(cy_head)) matrix_l_n[:, 0] = args_head - matrix_l_n[:, 2] = x_min_head+30 - matrix_l_n[:, 3] = x_max_head-30 + # +/- 30px to avoid crossing col peaks by accident + matrix_l_n[:, 2] = x_min_head + 30 + matrix_l_n[:, 3] = x_max_head - 30 matrix_l_n[:, 4] = dist_x_head - matrix_l_n[:, 5] = y_min_head-3-8 - matrix_l_n[:, 6] = y_min_head-5-8 - matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + matrix_l_n[:, 5] = cy_head + matrix_l_n[:, 6] = y_min_head + matrix_l_n[:, 7] = y_max_head matrix_l_n[:, 8] = 4 matrix_of_seps_ch = np.append( matrix_of_seps_ch, matrix_l_n, axis=0) From 7c3e41858877211c82f5b6c91a02fccfe146cacb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 16:13:51 +0200 Subject: [PATCH 09/91] `return_boxes_of_images_by_order_of_reading_new`: simplify - enumeration instead of indexing - array instead of list operations - add better plotting (but commented out) --- 
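The heading split in PATCH 08 above hinges on one numpy idiom: `np.stack((a, b)).T.flatten()` interleaves two equal-length arrays elementwise, so every heading contributes its topline and baseline as two consecutive pseudo-separators. A tiny demo with made-up values:

    import numpy as np

    y_min_head = np.array([100, 500])  # heading toplines
    y_max_head = np.array([140, 560])  # heading baselines
    cy = np.stack((y_min_head, y_max_head)).T.flatten()
    print(cy)                          # [100 140 500 560]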
src/eynollah/utils/__init__.py | 349 ++++++++++++++++----------------- 1 file changed, 165 insertions(+), 184 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 0c3e4ae..698b0bd 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -5,6 +5,7 @@ import math try: import matplotlib.pyplot as plt + import matplotlib.patches as patches except ImportError: plt = None import numpy as np @@ -20,6 +21,7 @@ from .contour import (contours_in_same_horizon, return_contours_of_image, return_parent_contours) + def pairwise(iterable): # pairwise('ABCDEFG') → AB BC CD DE EF FG @@ -205,15 +207,15 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( #print(x_end,'x_end') #print(len_sep) - deleted=[] + deleted = set() for i in range(len(x_start)-1): nodes_i=set(range(x_start[i],x_end[i]+1)) for j in range(i+1,len(x_start)): if nodes_i==set(range(x_start[j],x_end[j]+1)): - deleted.append(j) + deleted.add(j) #print(np.unique(deleted)) - remained_sep_indexes=set(range(len(x_start)))-set(np.unique(deleted) ) + remained_sep_indexes = set(range(len(x_start))) - deleted #print(remained_sep_indexes,'remained_sep_indexes') mother=[]#if it has mother child=[] @@ -262,7 +264,7 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother] - reading_orther_type=0 + reading_order_type=0 x_end_without_mother = x_end[remained_sep_indexes_without_mother] x_start_without_mother = x_start[remained_sep_indexes_without_mother] y_lines_without_mother = y_sep[remained_sep_indexes_without_mother] @@ -278,12 +280,11 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_end[remained_sep_indexes_without_mother[j]] # + 1 )) - set_diff = nodes_i - nodes_j - if set_diff != nodes_i: - reading_orther_type = 1 + if nodes_i - nodes_j != nodes_i: + reading_order_type = 1 else: - reading_orther_type = 0 - #print(reading_orther_type,'javab') + reading_order_type = 0 + #print(reading_order_type,'javab') #print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother') #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') @@ -297,7 +298,7 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( #print(all_args_uniq,'all_args_uniq') #print(args_to_be_unified,'args_to_be_unified') - return (reading_orther_type, + return (reading_order_type, x_start_returned, x_end_returned, y_sep_returned, @@ -1590,77 +1591,90 @@ def return_boxes_of_images_by_order_of_reading_new( if logger is None: logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') + # def dbg_plt(box=None, title=None): + # if box is None: + # box = [None, None, None, None] + # img = regions_without_separators[box[2]:box[3], box[0]:box[1]] + # plt.imshow(img) + # xrange = np.arange(0, img.shape[1], 100) + # yrange = np.arange(0, img.shape[0], 100) + # plt.gca().set_xticks(xrange, xrange + (box[0] or 0)) + # plt.gca().set_yticks(yrange, yrange + (box[2] or 0)) + # if title: + # plt.title(title) + # plt.show() + # dbg_plt() boxes=[] peaks_neg_tot_tables = [] splitter_y_new = np.array(splitter_y_new, dtype=int) - for i in range(len(splitter_y_new)-1): - #print(splitter_y_new[i],splitter_y_new[i+1]) - matrix_new = 
matrix_of_lines_ch[:,:][(matrix_of_lines_ch[:,6]> splitter_y_new[i] ) &
-                                           (matrix_of_lines_ch[:,7]< splitter_y_new[i+1] )]
+        matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) &
+                                        (matrix_of_lines_ch[:,7] < bot)]
         #print(len( matrix_new[:,9][matrix_new[:,9]==1] ))
         #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa')
         # check to see is there any vertical separator to find holes.
         #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and
         #    np.max(matrix_new[:,8][matrix_new[:,9]==1]) >=
-        #    0.1 * (np.abs(splitter_y_new[i+1]-splitter_y_new[i]))):
+        #    0.1 * (np.abs(bot-top))):
         if True:
             try:
                 num_col, peaks_neg_fin = find_num_col(
-                    regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], :],
+                    regions_without_separators[top:bot],
                     num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.)
             except:
                 peaks_neg_fin=[]
                 num_col = 0
             try:
                 if (len(peaks_neg_fin)+1)<num_col_classifier:
                     peaks_neg_fin_org = np.copy(peaks_neg_fin)
                     if len(peaks_neg_fin)==0:
                         num_col, peaks_neg_fin = find_num_col(
-                            regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], :],
+                            regions_without_separators[top:bot],
                             num_col_classifier, tables, multiplier=3.)
-                    peaks_neg_fin_early=[]
-                    peaks_neg_fin_early.append(0)
-                    for p_n in peaks_neg_fin:
-                        peaks_neg_fin_early.append(p_n)
-                    peaks_neg_fin_early.append(regions_without_separators.shape[1]-1)
+                    peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot - 1]

                     peaks_neg_fin_rev=[]
-                    for i_n in range(len(peaks_neg_fin_early)-1):
+                    for left, right in pairwise(peaks_neg_fin_early):
                         try:
                             num_col, peaks_neg_fin1 = find_num_col(
-                                regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1],
-                                                           peaks_neg_fin_early[i_n]:peaks_neg_fin_early[i_n+1]],
+                                regions_without_separators[top:bot, left:right],
                                 num_col_classifier, tables, multiplier=7.)
                         except:
                             peaks_neg_fin1=[]
                         try:
                             num_col, peaks_neg_fin2 = find_num_col(
-                                regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1],
-                                                           peaks_neg_fin_early[i_n]:peaks_neg_fin_early[i_n+1]],
+                                regions_without_separators[top:bot, left:right],
                                 num_col_classifier, tables, multiplier=5.)
                         except:
-                            peaks_neg_fin2=[]
-
-                        if len(peaks_neg_fin1)>=len(peaks_neg_fin2):
-                            peaks_neg_fin=list(np.copy(peaks_neg_fin1))
+                            peaks_neg_fin2 = []
+                        if len(peaks_neg_fin1) >= len(peaks_neg_fin2):
+                            peaks_neg_fin = peaks_neg_fin1
                         else:
-                            peaks_neg_fin=list(np.copy(peaks_neg_fin2))
-                        peaks_neg_fin=list(np.array(peaks_neg_fin)+peaks_neg_fin_early[i_n])
-
-                        if i_n!=(len(peaks_neg_fin_early)-2):
-                            peaks_neg_fin_rev.append(peaks_neg_fin_early[i_n+1])
+                            peaks_neg_fin = peaks_neg_fin2
+                        peaks_neg_fin = list(np.array(peaks_neg_fin) + left)
                         #print(peaks_neg_fin,'peaks_neg_fin')
+
+                        if right < peaks_neg_fin_early[-1]:
+                            peaks_neg_fin_rev.append(right)
+                        peaks_neg_fin_rev.extend(peaks_neg_fin)

                     if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org):
                         peaks_neg_fin=list(np.sort(peaks_neg_fin_rev))
@@ -1673,21 +1687,20 @@
             except:
                 logger.exception("cannot find peaks consistent with columns")
                 #num_col, peaks_neg_fin = find_num_col(
-                #    regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1],:],
+                #    regions_without_separators[top:bot,:],
                 #    multiplier=7.0)

             x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ]
             x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ]
             cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ]
             cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ]
-            arg_org_hor_some=matrix_new[:,0][ (matrix_new[:,9]==0) ]

             if right2left_readingorder:
-                x_max_hor_some_new = regions_without_separators.shape[1] - x_min_hor_some
-                x_min_hor_some_new = regions_without_separators.shape[1] - x_max_hor_some
+                x_max_hor_some_new = width_tot - x_min_hor_some
+                x_min_hor_some_new = width_tot - x_max_hor_some
                 x_min_hor_some =list(np.copy(x_min_hor_some_new))
                 x_max_hor_some =list(np.copy(x_max_hor_some_new))

-            peaks_neg_tot=return_points_with_boundies(peaks_neg_fin,0, regions_without_separators[:,:].shape[1])
+            peaks_neg_tot = [0] + peaks_neg_fin + [width_tot]
             peaks_neg_tot_tables.append(peaks_neg_tot)

             reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \
@@ -1697,26 +1710,27 @@
                 x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff)

             all_columns = set(range(len(peaks_neg_tot) - 1))
+            # print("all_columns", all_columns)
-            if ((reading_order_type==1) or
-                (reading_order_type==0 and
-                 (len(y_lines_without_mother)>=2 or there_is_sep_with_child==1))):
+            if (reading_order_type == 1 or
len(y_lines_without_mother) >= 2 or + there_is_sep_with_child == 1): try: - y_grenze = splitter_y_new[i] + 300 + y_grenze = top + 300 #check if there is a big separator in this y_mains_sep_ohne_grenzen args_early_ys=np.arange(len(y_type_2)) #print(args_early_ys,'args_early_ys') - #print(splitter_y_new[i], splitter_y_new[i+1]) + #print(top, bot) - x_starting_up = x_starting[(y_type_2 > splitter_y_new[i]) & + x_starting_up = x_starting[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_ending_up = x_ending[(y_type_2 > splitter_y_new[i]) & + x_ending_up = x_ending[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_type_2_up = y_type_2[(y_type_2 > splitter_y_new[i]) & + y_type_2_up = y_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_diff_type_2_up = y_diff_type_2[(y_type_2 > splitter_y_new[i]) & + y_diff_type_2_up = y_diff_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - args_up = args_early_ys[(y_type_2 > splitter_y_new[i]) & + args_up = args_early_ys[(y_type_2 > top) & (y_type_2 <= y_grenze)] if len(y_type_2_up) > 0: y_main_separator_up = y_type_2_up [(x_starting_up==0) & @@ -1730,27 +1744,28 @@ def return_boxes_of_images_by_order_of_reading_new( args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) #print(args_to_be_kept,'args_to_be_kept') boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - splitter_y_new[i], y_diff_main_separator_up.max()]) - splitter_y_new[i] = y_diff_main_separator_up.max() + top, y_diff_main_separator_up.max()]) + # dbg_plt(boxes[-1], "first box") + top = y_diff_main_separator_up.max() - #print(splitter_y_new[i],'splitter_y_new[i]') + #print(top,'top') y_type_2 = y_type_2[args_to_be_kept] x_starting = x_starting[args_to_be_kept] x_ending = x_ending[args_to_be_kept] y_diff_type_2 = y_diff_type_2[args_to_be_kept] #print('galdiha') - y_grenze = splitter_y_new[i] + 200 + y_grenze = top + 200 args_early_ys2=np.arange(len(y_type_2)) - y_type_2_up=y_type_2[(y_type_2 > splitter_y_new[i]) & + y_type_2_up=y_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_starting_up=x_starting[(y_type_2 > splitter_y_new[i]) & + x_starting_up=x_starting[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_ending_up=x_ending[(y_type_2 > splitter_y_new[i]) & + x_ending_up=x_ending[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_diff_type_2_up=y_diff_type_2[(y_type_2 > splitter_y_new[i]) & + y_diff_type_2_up=y_diff_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - args_up2=args_early_ys2[(y_type_2 > splitter_y_new[i]) & + args_up2=args_early_ys2[(y_type_2 > top) & (y_type_2 <= y_grenze)] #print(y_type_2_up,x_starting_up,x_ending_up,'didid') nodes_in = set() @@ -1804,13 +1819,14 @@ def return_boxes_of_images_by_order_of_reading_new( pass #print('burdaydikh2') - #int(splitter_y_new[i]) + #int(top) y_lines_by_order=[] x_start_by_order=[] x_end_by_order=[] - if (len(x_end_with_child_without_mother)==0 and reading_order_type==0) or reading_order_type==1: - if reading_order_type==1: - y_lines_by_order.append(splitter_y_new[i]) + if (reading_order_type == 1 or + len(x_end_with_child_without_mother) == 0): + if reading_order_type == 1: + y_lines_by_order.append(top) x_start_by_order.append(0) x_end_by_order.append(len(peaks_neg_tot)-2) else: @@ -1823,8 +1839,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_not_covered = list(all_columns - columns_covered_by_mothers) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, 
[splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) @@ -1839,22 +1855,15 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args[x_starting==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() @@ -1864,8 +1873,8 @@ def return_boxes_of_images_by_order_of_reading_new( x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) @@ -1888,25 +1897,24 @@ def return_boxes_of_images_by_order_of_reading_new( x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) for i_s_nc in columns_not_covered_child_no_mother: if i_s_nc in x_start_with_child_without_mother: + #print("i_s_nc", i_s_nc) x_end_biggest_column = \ x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & (x_ending==x_end_biggest_column)] y_column_nc = y_type_2[args_all_biggest_lines] - x_start_column_nc = x_starting[args_all_biggest_lines] - x_end_column_nc = x_ending[args_all_biggest_lines] + #x_start_column_nc = x_starting[args_all_biggest_lines] + #x_end_column_nc = x_ending[args_all_biggest_lines] y_column_nc = np.sort(y_column_nc) for i_c in range(len(y_column_nc)): - if i_c==(len(y_column_nc)-1): - ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & - (y_type_2=i_s_nc) & - (x_ending<=x_end_biggest_column)] - else: - ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & - (y_type_2=i_s_nc) & - (x_ending<=x_end_biggest_column)] + #print("i_c", i_c) + ind_all_lines_between_nm_wc = \ + ind_args[(y_type_2 > y_column_nc[i_c]) & + (y_type_2 < (y_column_nc[i_c+1] + if i_c < len(y_column_nc)-1 + else bot)) & + (x_starting >= i_s_nc) & + (x_ending <= x_end_biggest_column)] y_all_between_nm_wc = 
y_type_2[ind_all_lines_between_nm_wc] x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] @@ -1965,78 +1973,58 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_all_between_nm_wc[ind_args_in_col] x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] #print('babali3') ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: #print(column,'column') ind_args_in_col=ind_args[x_starting==i_s_nc] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + ind_args_col_sorted = np.argsort(y_column) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + y_lines_by_order = np.array(y_lines_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) for il in range(len(y_lines_by_order)): - y_copy = list(y_lines_by_order) - x_start_copy = list(x_start_by_order) - x_end_copy = list(x_end_by_order) - - #print(y_copy,'y_copy') - y_itself=y_copy.pop(il) - x_start_itself=x_start_copy.pop(il) - x_end_itself=x_end_copy.pop(il) - - #print(y_copy,'y_copy2') + #print(il, "il") + y_itself = y_lines_by_order[il] + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') - y_in_cols=[] - for yic in range(len(y_copy)): - #print('burda') - if (y_copy[yic]>y_itself and - column>=x_start_copy[yic] and - column<=x_end_copy[yic]): - y_in_cols.append(y_copy[yic]) + y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print('burda') + y_down = y_in_cols.min(initial=bot) #print('burda2') #print(y_in_cols,'y_in_cols') - if len(y_in_cols)>0: - y_down=np.min(y_in_cols) - else: - y_down=splitter_y_new[i+1] #print(y_itself,'y_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_itself, y_down]) + # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) except: logger.exception("cannot 
assign boxes") boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - splitter_y_new[i], splitter_y_new[i+1]]) + top, bot]) + # dbg_plt(boxes[-1], "fallback box") else: y_lines_by_order=[] x_start_by_order=[] @@ -2050,8 +2038,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1, - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2064,8 +2052,8 @@ def return_boxes_of_images_by_order_of_reading_new( else: columns_not_covered = list(all_columns) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2075,71 +2063,64 @@ def return_boxes_of_images_by_order_of_reading_new( for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + ind_args_col_sorted = np.argsort(y_column) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + y_lines_by_order = np.array(y_lines_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) for il in range(len(y_lines_by_order)): - y_copy = list(y_lines_by_order) - x_start_copy = list(x_start_by_order) - x_end_copy = list(x_end_by_order) - - #print(y_copy,'y_copy') - y_itself=y_copy.pop(il) - x_start_itself=x_start_copy.pop(il) - x_end_itself=x_end_copy.pop(il) - + #print(il, "il") + y_itself = y_lines_by_order[il] + #print(y_itself,'y_itself') + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] for column in range(x_start_itself, x_end_itself+1): #print(column,'cols') - y_in_cols=[] - for yic in range(len(y_copy)): - #print('burda') - if (y_copy[yic]>y_itself and - column>=x_start_copy[yic] and - column<=x_end_copy[yic]): - y_in_cols.append(y_copy[yic]) + y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & + (column >= x_start_by_order) & + 
(column <= x_end_by_order)] #print('burda2') #print(y_in_cols,'y_in_cols') - if len(y_in_cols)>0: - y_down=np.min(y_in_cols) - else: - y_down=splitter_y_new[i+1] - #print(y_itself,'y_itself') + y_down = y_in_cols.min(initial=bot) + #print(y_down,'y_down') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_itself, y_down]) + # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) #else: - #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,splitter_y_new[i],splitter_y_new[i+1]]) + #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,top, bot]) if right2left_readingorder: peaks_neg_tot_tables_new = [] if len(peaks_neg_tot_tables)>=1: for peaks_tab_ind in peaks_neg_tot_tables: - peaks_neg_tot_tables_ind = regions_without_separators.shape[1] - np.array(peaks_tab_ind) + peaks_neg_tot_tables_ind = width_tot - np.array(peaks_tab_ind) peaks_neg_tot_tables_ind = list(peaks_neg_tot_tables_ind[::-1]) peaks_neg_tot_tables_new.append(peaks_neg_tot_tables_ind) for i in range(len(boxes)): - x_start_new = regions_without_separators.shape[1] - boxes[i][1] - x_end_new = regions_without_separators.shape[1] - boxes[i][0] + x_start_new = width_tot - boxes[i][1] + x_end_new = width_tot - boxes[i][0] boxes[i][0] = x_start_new boxes[i][1] = x_end_new peaks_neg_tot_tables = peaks_neg_tot_tables_new + # show final xy-cut + # plt.imshow(regions_without_separators) + # for xmin, xmax, ymin, ymax in boxes: + # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # plt.show() + logger.debug('exit return_boxes_of_images_by_order_of_reading_new') return boxes, peaks_neg_tot_tables From 0fc4b2535dc005612406cd4ffbf2471a5b4e1485 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 16:47:35 +0200 Subject: [PATCH 10/91] `return_boxes_of_images_by_order_of_reading_new`: fix no-mother case - when handling lines without mother, and biggest line already accounts for all columns, but some are too close to the top and therefore must be removed, avoid invalidating `biggest` index, causing `IndexError` - remove try-catch (now unnecessary) - array instead of list operations --- src/eynollah/utils/__init__.py | 62 ++++++++++++++++------------------ 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 698b0bd..b331cab 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1919,54 +1919,50 @@ def return_boxes_of_images_by_order_of_reading_new( x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] - x_diff_all_between_nm_wc = x_ending_all_between_nm_wc - x_starting_all_between_nm_wc - if len(x_diff_all_between_nm_wc)>0: - biggest=np.argmax(x_diff_all_between_nm_wc) - columns_covered_by_mothers = set() - for dj in range(len(x_starting_all_between_nm_wc)): + for dj in range(len(ind_all_lines_between_nm_wc)): columns_covered_by_mothers.update( range(x_starting_all_between_nm_wc[dj], x_ending_all_between_nm_wc[dj])) child_columns = set(range(i_s_nc, x_end_biggest_column)) columns_not_covered = list(child_columns - columns_covered_by_mothers) - should_longest_line_be_extended=0 - if (len(x_diff_all_between_nm_wc) > 0 and - set(list(range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])) + - list(columns_not_covered)) != child_columns): - should_longest_line_be_extended=1 - 
index_lines_so_close_to_top_separator = \ - np.arange(len(y_all_between_nm_wc))[(y_all_between_nm_wc>y_column_nc[i_c]) & - (y_all_between_nm_wc<=(y_column_nc[i_c]+500))] - if len(index_lines_so_close_to_top_separator) > 0: - indexes_remained_after_deleting_closed_lines= \ - np.array(list(set(list(range(len(y_all_between_nm_wc)))) - - set(list(index_lines_so_close_to_top_separator)))) - if len(indexes_remained_after_deleting_closed_lines) > 0: + if len(ind_all_lines_between_nm_wc): + biggest = np.argmax(x_ending_all_between_nm_wc - + x_starting_all_between_nm_wc) + if columns_covered_by_mothers == set( + range(x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest])): + # biggest accounts for all columns alone, + # longest line should be extended + lines_so_close_to_top_separator = \ + ((y_all_between_nm_wc > y_column_nc[i_c]) & + (y_all_between_nm_wc <= y_column_nc[i_c] + 500)) + if (np.count_nonzero(lines_so_close_to_top_separator) and + np.count_nonzero(lines_so_close_to_top_separator) < + len(ind_all_lines_between_nm_wc)): y_all_between_nm_wc = \ - y_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + y_all_between_nm_wc[~lines_so_close_to_top_separator] x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + x_starting_all_between_nm_wc[~lines_so_close_to_top_separator] x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + x_ending_all_between_nm_wc[~lines_so_close_to_top_separator] - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) - - if len(x_diff_all_between_nm_wc) > 0: - try: + y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) + x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) + else: y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - except: - logger.exception("cannot append") - y_all_between_nm_wc = np.append(y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) + if len(columns_not_covered): + y_all_between_nm_wc = np.append( + y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) for column in range(int(i_s_nc), int(x_end_biggest_column)): From e2dfec75fbefe3e5aeffd71a7a61eab6092f6c92 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:19:20 +0200 Subject: [PATCH 11/91] `return_x_start_end_mothers_childs_and_type_of_reading_order`: simplify and document MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit - simplify - rename identifiers to make readable: - `y_sep` → `y_mid` (because the cy gets passed) - `y_diff` → `y_max` (because the ymax gets passed) - array instead of list operations - add docstring and in-line comments - return (zero-length) numpy array instead of empty list --- src/eynollah/eynollah.py | 10 +- src/eynollah/utils/__init__.py | 378 +++++++++++++++++---------------- 2 files changed, 198 insertions(+), 190 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 9412861..08ffed7 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2507,6 +2507,7 @@ class Eynollah: My_main[ii] < box[3])): arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True + #print("main/matched", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", box, only_centers) break if not check_if_textregion_located_in_a_box: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) @@ -2514,6 +2515,7 @@ class Eynollah: (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_main[ii] = ind_min + #print("main/fallback", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", boxes[ind_min], only_centers) args_contours_main = np.arange(len(contours_only_text_parent)) order_by_con_main = np.zeros_like(arg_text_con_main) @@ -2531,6 +2533,7 @@ class Eynollah: My_head[ii] < box[3])): arg_text_con_head[ii] = jj check_if_textregion_located_in_a_box = True + #print("head/matched", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", box, only_centers) break if not check_if_textregion_located_in_a_box: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) @@ -2538,6 +2541,7 @@ class Eynollah: (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_head[ii] = ind_min + #print("head/fallback", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", boxes[ind_min], only_centers) args_contours_head = np.arange(len(contours_only_text_parent_h)) order_by_con_head = np.zeros_like(arg_text_con_head) @@ -2587,7 +2591,7 @@ class Eynollah: try: results = match_boxes(False) except Exception as why: - self.logger.error(why) + self.logger.exception(why) results = match_boxes(True) self.logger.debug("exit do_order_of_regions") @@ -2976,7 +2980,7 @@ class Eynollah: max(self.num_col_lower or num_col_classifier, num_col_classifier)) except Exception as why: - self.logger.error(why) + self.logger.exception(why) num_col = None #print("inside graphics 3 ", time.time() - t_in_gr) return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, @@ -3044,7 +3048,7 @@ class Eynollah: if not num_column_is_classified: num_col_classifier = num_col + 1 except Exception as why: - self.logger.error(why) + self.logger.exception(why) num_col = None return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page, table_prediction) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index b331cab..f1a8aae 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -33,226 +33,229 @@ def pairwise(iterable): a = b def return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, 
cy_hor_some, peak_points, cy_hor_diff): + x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, y_max_hor_some): + """ + Analyse which separators overlap multiple column candidates, + and how they overlap each other. + + Ignore separators not spanning multiple columns. + + For the separators to be returned, try to join them when they are directly + adjacent horizontally but nearby vertically (and thus mutually compatible). + Also, mark any separators that already span the full width. + + Furthermore, identify which pairs of (unjoined) separators span subsets of columns + of each other (disregarding vertical positions). Referring, respectively, to the + superset separators as "mothers" and to the subset separators as "children", + retrieve information on which columns are spanned by separators with no mother, + and which columns are spanned by their children (if any). + + Moreover, determine if there is any (column) overlap among the multi-span separators + with no mother, specifically (and thus, no simple box separation is possible). + + Arguments: + * the x start column index of the raw separators + * the x end column index of the raw separators + * the y center coordinate of the raw separators + * the x column coordinates + * the y end coordinate of the raw separators + + Returns: + a tuple of: + * whether any top-level (no-mother) multi-span separators overlap each other + * the x start column index of the resulting multi-span separators + * the x end column index of the resulting multi-span separators + * the y center coordinate of the resulting multi-span separators + * the y end coordinate of the resulting multi-span separators + * the y center (for 1 representative) of the top-level (no-mother) multi-span separators + * the x start column index of the top-level (no-mother) multi-span separators + * the x end column index of the top-level (no-mother) multi-span separators + * whether any multi-span separators have super-spans of other (child) multi-span separators + * the y center (for 1 representative) of the top-level (no-mother) multi-span separators + which have super-spans of other (child) multi-span separators + * the x start column index of the top-level multi-span separators + which have super-spans of other (child) multi-span separators + * the x end column index of the top-level multi-span separators + which have super-spans of other (child) multi-span separators + * indexes of multi-span separators with full-width span + """ x_start=[] x_end=[] - kind=[]#if covers 2 and more than 2 columns set it to 1 otherwise 0 len_sep=[] - y_sep=[] - y_diff=[] + y_mid=[] + y_max=[] new_main_sep_y=[] - indexer=0 for i in range(len(x_min_hor_some)): - starting=x_min_hor_some[i]-peak_points - starting=starting[starting>=0] - min_start=np.argmin(starting) - ending=peak_points-x_max_hor_some[i] - len_ending_neg=len(ending[ending<=0]) - - ending=ending[ending>0] - max_end=np.argmin(ending)+len_ending_neg + #print(indexer, "%d:%d" % (x_min_hor_some[i], x_max_hor_some[i]), cy_hor_some[i]) + starting = x_min_hor_some[i] - peak_points + min_start = np.flatnonzero(starting >= 0)[-1] # last left-of + ending = x_max_hor_some[i] - peak_points + max_end = np.flatnonzero(ending < 0)[0] # first right-of + #print(indexer, "%d:%d" % (min_start, max_end)) if (max_end-min_start)>=2: + # column range of separator spans more than one column candidate if (max_end-min_start)==(len(peak_points)-1): + # all columns (i.e. 
could be true new y splitter) new_main_sep_y.append(indexer) #print((max_end-min_start),len(peak_points),'(max_end-min_start)') - y_sep.append(cy_hor_some[i]) - y_diff.append(cy_hor_diff[i]) + y_mid.append(cy_hor_some[i]) + y_max.append(y_max_hor_some[i]) x_end.append(max_end) - - x_start.append( min_start) - + x_start.append(min_start) len_sep.append(max_end-min_start) - if max_end==min_start+1: - kind.append(0) - else: - kind.append(1) - indexer+=1 + #print(x_start,'x_start') + #print(x_end,'x_end') x_start_returned = np.array(x_start, dtype=int) x_end_returned = np.array(x_end, dtype=int) - y_sep_returned = np.array(y_sep, dtype=int) - y_diff_returned = np.array(y_diff, dtype=int) - - all_args_uniq = contours_in_same_horizon(y_sep_returned) - args_to_be_unified=[] - y_unified=[] - y_diff_unified=[] - x_s_unified=[] - x_e_unified=[] - if len(all_args_uniq)>0: - #print('burda') - if type(all_args_uniq[0]) is list: - for dd in range(len(all_args_uniq)): - if len(all_args_uniq[dd])==2: - x_s_same_hor=np.array(x_start_returned)[all_args_uniq[dd]] - x_e_same_hor=np.array(x_end_returned)[all_args_uniq[dd]] - y_sep_same_hor=np.array(y_sep_returned)[all_args_uniq[dd]] - y_diff_same_hor=np.array(y_diff_returned)[all_args_uniq[dd]] - #print('burda2') - if (x_s_same_hor[0]==x_e_same_hor[1]-1 or - x_s_same_hor[1]==x_e_same_hor[0]-1 and - x_s_same_hor[0]!=x_s_same_hor[1] and - x_e_same_hor[0]!=x_e_same_hor[1]): - #print('burda3') - for arg_in in all_args_uniq[dd]: - #print(arg_in,'arg_in') - args_to_be_unified.append(arg_in) - y_selected=np.min(y_sep_same_hor) - y_diff_selected=np.max(y_diff_same_hor) - x_s_selected=np.min(x_s_same_hor) - x_e_selected=np.max(x_e_same_hor) - - x_s_unified.append(x_s_selected) - x_e_unified.append(x_e_selected) - y_unified.append(y_selected) - y_diff_unified.append(y_diff_selected) - #print(x_s_same_hor,'x_s_same_hor') - #print(x_e_same_hor[:]-1,'x_e_same_hor') - #print('#############################') - #print(x_s_unified,'y_selected') - #print(x_e_unified,'x_s_selected') - #print(y_unified,'x_e_same_hor') - - args_lines_not_unified=list( set(range(len(y_sep_returned)))-set(args_to_be_unified) ) - #print(args_lines_not_unified,'args_lines_not_unified') - - x_start_returned_not_unified=list( np.array(x_start_returned)[args_lines_not_unified] ) - x_end_returned_not_unified=list( np.array(x_end_returned)[args_lines_not_unified] ) - y_sep_returned_not_unified=list (np.array(y_sep_returned)[args_lines_not_unified] ) - y_diff_returned_not_unified=list (np.array(y_diff_returned)[args_lines_not_unified] ) - - for dv in range(len(y_unified)): - y_sep_returned_not_unified.append(y_unified[dv]) - y_diff_returned_not_unified.append(y_diff_unified[dv]) - x_start_returned_not_unified.append(x_s_unified[dv]) - x_end_returned_not_unified.append(x_e_unified[dv]) - - #print(y_sep_returned,'y_sep_returned') + y_mid_returned = np.array(y_mid, dtype=int) + y_max_returned = np.array(y_max, dtype=int) + #print(y_mid_returned,'y_mid_returned') #print(x_start_returned,'x_start_returned') #print(x_end_returned,'x_end_returned') - x_start_returned = np.array(x_start_returned_not_unified, dtype=int) - x_end_returned = np.array(x_end_returned_not_unified, dtype=int) - y_sep_returned = np.array(y_sep_returned_not_unified, dtype=int) - y_diff_returned = np.array(y_diff_returned_not_unified, dtype=int) + # join/elongate separators if follow-up x and similar y + sep_pairs = contours_in_same_horizon(y_mid_returned) + if len(sep_pairs): + #print('burda') + args_to_be_unified = set() + y_mid_unified = 
[] + y_max_unified = [] + x_start_unified = [] + x_end_unified = [] + for pair in sep_pairs: + if (not np.array_equal(*x_start_returned[pair]) and + not np.array_equal(*x_end_returned[pair]) and + # immediately adjacent columns? + np.diff(x_end_returned[pair] - + x_start_returned[pair])[0] in [1, -1]): - #print(y_sep_returned,'y_sep_returned2') + args_to_be_unified.union(set(pair)) + y_mid_unified.append(np.min(y_mid_returned[pair])) + y_max_unified.append(np.max(y_max_returned[pair])) + x_start_unified.append(np.min(x_start_returned[pair])) + x_end_unified.append(np.max(x_end_returned[pair])) + #print(pair,'pair') + #print(x_start_returned[pair],'x_s_same_hor') + #print(x_end_returned[pair],'x_e_same_hor') + #print(y_mid_unified,'y_mid_unified') + #print(y_max_unified,'y_max_unified') + #print(x_start_unified,'x_s_unified') + #print(x_end_unified,'x_e_selected') + #print('#############################') + + if len(y_mid_unified): + args_lines_not_unified = np.setdiff1d(np.arange(len(y_mid_returned)), + list(args_to_be_unified), assume_unique=True) + #print(args_lines_not_unified,'args_lines_not_unified') + x_start_returned = np.append(x_start_returned[args_lines_not_unified], + x_start_unified, axis=0) + x_end_returned = np.append(x_end_returned[args_lines_not_unified], + x_end_unified, axis=0) + y_mid_returned = np.append(y_mid_returned[args_lines_not_unified], + y_mid_unified, axis=0) + y_max_returned = np.append(y_max_returned[args_lines_not_unified], + y_max_unified, axis=0) + #print(y_mid_returned,'y_mid_returned2') #print(x_start_returned,'x_start_returned2') #print(x_end_returned,'x_end_returned2') - #print(new_main_sep_y,'new_main_sep_y') + #print(new_main_sep_y,'new_main_sep_y') #print(x_start,'x_start') #print(x_end,'x_end') - if len(new_main_sep_y)>0: + x_start = np.array(x_start) + x_end = np.array(x_end) + y_mid = np.array(y_mid) + if len(new_main_sep_y): + # some full-width multi-span separators exist, so + # restrict the y range of separators to search for + # mutual overlaps to only those within the largest + # y strip between adjacent multi-span separators + # that involve at least one such full-width seps. 
+            # (does not affect the separators to be returned)
+            min_ys=np.min(y_mid)
+            max_ys=np.max(y_mid)
             #print(min_ys,'min_ys')
             #print(max_ys,'max_ys')
 
-            min_ys=np.min(y_sep)
-            max_ys=np.max(y_sep)
+            y_mains0 = list(y_mid[new_main_sep_y])
+            y_mains = [min_ys] + y_mains0 + [max_ys]
 
-            y_mains=[]
-            y_mains.append(min_ys)
-            y_mains_sep_ohne_grenzen=[]
+            y_mains = np.sort(y_mains)
+            argm = np.argmax(np.diff(y_mains))
+            y_mid_new = y_mains[argm]
+            y_mid_next_new = y_mains[argm + 1]
 
-            for ii in range(len(new_main_sep_y)):
-                y_mains.append(y_sep[new_main_sep_y[ii]])
-                y_mains_sep_ohne_grenzen.append(y_sep[new_main_sep_y[ii]])
-
-            y_mains.append(max_ys)
-
-            y_mains_sorted=np.sort(y_mains)
-            diff=np.diff(y_mains_sorted)
-            argm=np.argmax(diff)
-
-            y_min_new=y_mains_sorted[argm]
-            y_max_new=y_mains_sorted[argm+1]
-
-            #print(y_min_new,'y_min_new')
-            #print(y_max_new,'y_max_new')
-            #print(y_sep[new_main_sep_y[0]],y_sep,'yseps')
+            #print(y_mid_new,argm,'y_mid_new')
+            #print(y_mid_next_new,argm+1,'y_mid_next_new')
+            #print(y_mid[new_main_sep_y],new_main_sep_y,'yseps')
             x_start=np.array(x_start)
             x_end=np.array(x_end)
-            kind=np.array(kind)
-            y_sep=np.array(y_sep)
-            if (y_min_new in y_mains_sep_ohne_grenzen and
-                y_max_new in y_mains_sep_ohne_grenzen):
-                x_start=x_start[(y_sep>y_min_new) & (y_sep<y_max_new)]
-                x_end=x_end[(y_sep>y_min_new) & (y_sep<y_max_new)]
-                kind=kind[(y_sep>y_min_new) & (y_sep<y_max_new)]
-                y_sep=y_sep[(y_sep>y_min_new) & (y_sep<y_max_new)]
-            elif (y_min_new in y_mains_sep_ohne_grenzen and
-                  y_max_new not in y_mains_sep_ohne_grenzen):
-                x_start=x_start[(y_sep>y_min_new) & (y_sep<=y_max_new)]
-                #print('burda1')
-                x_end=x_end[(y_sep>y_min_new) & (y_sep<=y_max_new)]
-                #print('burda2')
-                kind=kind[(y_sep>y_min_new) & (y_sep<=y_max_new)]
-                y_sep=y_sep[(y_sep>y_min_new) & (y_sep<=y_max_new)]
-            elif (y_min_new not in y_mains_sep_ohne_grenzen and
-                  y_max_new in y_mains_sep_ohne_grenzen):
-                x_start=x_start[(y_sep>=y_min_new) & (y_sep<y_max_new)]
-                x_end=x_end[(y_sep>=y_min_new) & (y_sep<y_max_new)]
-                kind=kind[(y_sep>=y_min_new) & (y_sep<y_max_new)]
-                y_sep=y_sep[(y_sep>=y_min_new) & (y_sep<y_max_new)]
+            if y_mid_new in y_mains0:
+                where = y_mid > y_mid_new
             else:
-                x_start=x_start[(y_sep>=y_min_new) & (y_sep<=y_max_new)]
-                x_end=x_end[(y_sep>=y_min_new) & (y_sep<=y_max_new)]
-                kind=kind[(y_sep>=y_min_new) & (y_sep<=y_max_new)]
-                y_sep=y_sep[(y_sep>=y_min_new) & (y_sep<=y_max_new)]
+                where = y_mid >= y_mid_new
+            if y_mid_next_new in y_mains0:
+                where &= y_mid < y_mid_next_new
+            else:
+                where &= y_mid <= y_mid_next_new
+            x_start = x_start[where]
+            x_end = x_end[where]
+            y_mid = y_mid[where]
     #print(x_start,'x_start')
     #print(x_end,'x_end')
-    #print(len_sep)
+    # remove redundant separators that span the same columns
+    # (keeping only 1 representative each)
     deleted = set()
-    for i in range(len(x_start)-1):
-        nodes_i=set(range(x_start[i],x_end[i]+1))
-        for j in range(i+1,len(x_start)):
-            if nodes_i==set(range(x_start[j],x_end[j]+1)):
-                deleted.add(j)
-    #print(np.unique(deleted))
-
+    for index_i in range(len(x_start) - 1):
+        nodes_i = set(range(x_start[index_i], x_end[index_i] + 1))
+        #print(nodes_i, "nodes_i")
+        for index_j in range(index_i + 1, len(x_start)):
+            nodes_j = set(range(x_start[index_j], x_end[index_j] + 1))
+            #print(nodes_j, "nodes_j")
+            if nodes_i == nodes_j:
+                deleted.add(index_j)
+    #print(deleted,"deleted")
     remained_sep_indexes = set(range(len(x_start))) - deleted
     #print(remained_sep_indexes,'remained_sep_indexes')
-    mother=[]#if it has mother
-    child=[]
+
+    # determine which separators span which columns
+    mother = []  # whether the respective separator has a mother separator
+    child = []  # whether the respective separator has a child separator
    for index_i in remained_sep_indexes:
        have_mother=0
        have_child=0
-        nodes_ind=set(range(x_start[index_i],x_end[index_i]+1))
+        nodes_i = set(range(x_start[index_i], x_end[index_i] + 1))
        for index_j in remained_sep_indexes:
-            nodes_ind_j=set(range(x_start[index_j],x_end[index_j]+1))
-            if nodes_ind<nodes_ind_j:
+            nodes_j = set(range(x_start[index_j], x_end[index_j] + 1))
+            if nodes_i < nodes_j:
                 have_mother=1
-            if nodes_ind>nodes_ind_j:
+            if nodes_i > nodes_j:
                 have_child=1
         mother.append(have_mother)
         child.append(have_child)
-
-    #print(mother,'mother')
-    #print(len(remained_sep_indexes))
-    #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_sep),'lens')
-    y_lines_without_mother=[]
-    x_start_without_mother=[]
-    x_end_without_mother=[]
-
-    y_lines_with_child_without_mother=[]
-    x_start_with_child_without_mother=[]
-    x_end_with_child_without_mother=[]
+    #print(mother, "mother")
+    #print(child, "child")
     mother = np.array(mother)
     child = np.array(child)
     #print(mother,'mother')
     #print(child,'child')
     remained_sep_indexes = np.array(list(remained_sep_indexes))
-    x_start = np.array(x_start)
-    x_end = np.array(x_end)
-    y_sep = np.array(y_sep)
+    #print(len(remained_sep_indexes))
+    #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_mid),'lens')
 
-    if len(remained_sep_indexes)>1:
+    reading_order_type = 0
+    if len(remained_sep_indexes):
         #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)')
         #print(np.array(mother),'mother')
         remained_sep_indexes_without_mother = remained_sep_indexes[mother==0]
@@ -262,52 +265,53 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order(
 
         x_end_with_child_without_mother = x_end[remained_sep_indexes_with_child_without_mother]
         x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother]
-        y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother]
+        y_mid_with_child_without_mother = y_mid[remained_sep_indexes_with_child_without_mother]
 
-        reading_order_type=0
         x_end_without_mother = x_end[remained_sep_indexes_without_mother]
         x_start_without_mother = x_start[remained_sep_indexes_without_mother]
-        y_lines_without_mother = y_sep[remained_sep_indexes_without_mother]
+        y_mid_without_mother = y_mid[remained_sep_indexes_without_mother]
 
         if len(remained_sep_indexes_without_mother)>=2:
             for i in range(len(remained_sep_indexes_without_mother)-1):
-                nodes_i=set(range(x_start[remained_sep_indexes_without_mother[i]],
-                                  x_end[remained_sep_indexes_without_mother[i]]
-                                  # + 1
-                                  ))
-                for j in range(i+1,len(remained_sep_indexes_without_mother)):
-                    nodes_j=set(range(x_start[remained_sep_indexes_without_mother[j]],
-                                      x_end[remained_sep_indexes_without_mother[j]]
-                                      # + 1
-                                      ))
+                index_i = remained_sep_indexes_without_mother[i]
+                nodes_i = set(range(x_start[index_i], x_end[index_i]))  # + 1
+                #print(index_i, nodes_i, "nodes_i without mother")
+                for j in range(i + 1, len(remained_sep_indexes_without_mother)):
+                    index_j = remained_sep_indexes_without_mother[j]
+                    nodes_j = set(range(x_start[index_j], x_end[index_j]))  # + 1
+                    #print(index_j, nodes_j, "nodes_j without mother")
                     if nodes_i - nodes_j != nodes_i:
+                        #print("type=1")
                         reading_order_type = 1
     else:
-            reading_order_type = 0
-        #print(reading_order_type,'javab')
-        #print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother')
+        y_mid_without_mother = np.zeros(0, int)
+        x_start_without_mother = np.zeros(0, int)
+        x_end_without_mother = np.zeros(0, int)
+        y_mid_with_child_without_mother = np.zeros(0, int)
+        x_start_with_child_without_mother = np.zeros(0, int)
+        x_end_with_child_without_mother = np.zeros(0, int)
+
+    #print(reading_order_type,'reading_order_type')
+    #print(y_mid_with_child_without_mother,'y_mid_with_child_without_mother')
    #print(x_start_with_child_without_mother,'x_start_with_child_without_mother')
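+    # Illustration of the mother/child notion (hypothetical spans): a separator
+    # covering columns {0,1,2,3} is a mother of one covering {2,3}; the superset
+    # separator gets have_child=1, the subset separator gets have_mother=1.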
#print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') len_sep_with_child = len(child[child==1]) - #print(len_sep_with_child,'len_sep_with_child') there_is_sep_with_child = 0 if len_sep_with_child >= 1: there_is_sep_with_child = 1 - #print(all_args_uniq,'all_args_uniq') - #print(args_to_be_unified,'args_to_be_unified') return (reading_order_type, x_start_returned, x_end_returned, - y_sep_returned, - y_diff_returned, - y_lines_without_mother, + y_mid_returned, + y_max_returned, + y_mid_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, - y_lines_with_child_without_mother, + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, new_main_sep_y) From b2a79cc6ed766cef5074629fcb76ae1c6846f084 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:31:52 +0200 Subject: [PATCH 12/91] `return_x_start_end_mothers_childs_and_type_of_reading_order`: fix+1 when calculating `reading_order_type`, upper limit on column range (`x_end`) needs to be `+1` here as well --- src/eynollah/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f1a8aae..3a383e9 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -274,11 +274,11 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( if len(remained_sep_indexes_without_mother)>=2: for i in range(len(remained_sep_indexes_without_mother)-1): index_i = remained_sep_indexes_without_mother[i] - nodes_i = set(range(x_start[index_i], x_end[index_i])) # + 1 + nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) #print(index_i, nodes_i, "nodes_i without mother") for j in range(i + 1, len(remained_sep_indexes_without_mother)): index_j = remained_sep_indexes_without_mother[j] - nodes_j = set(range(x_start[index_j], x_end[index_j])) # + 1 + nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) #print(index_j, nodes_j, "nodes_j without mother") if nodes_i - nodes_j != nodes_i: #print("type=1") From acee4c1bfe227055194050935f1868d1fb156701 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:43:41 +0200 Subject: [PATCH 13/91] `find_number_of_columns_in_document`: simplify --- src/eynollah/utils/__init__.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 3a383e9..f948de2 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1551,23 +1551,23 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, (x_max_head>=.84*region_pre_p.shape[1])] cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) - cy_seps_splitters = np.sort(cy_seps_splitters) + cy_seps_splitters = np.sort(cy_seps_splitters).astype(int) splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] - splitter_y_new_diff = np.diff(splitter_y_new) / float(region_pre_p.shape[0]) * 100 - - args_big_parts=np.arange(len(splitter_y_new_diff))[ splitter_y_new_diff>22 ] + big_part = 22 * region_pre_p.shape[0] // 100 # percent height regions_without_separators=return_regions_without_separators(region_pre_p) - length_y_threshold=regions_without_separators.shape[0]/4.0 num_col_fin=0 peaks_neg_fin_fin=[] - for itiles in args_big_parts: - regions_without_separators_tile=regions_without_separators[int(splitter_y_new[itiles]): - 
int(splitter_y_new[itiles+1]),:]
+    num_big_parts = 0
+    for top, bot in pairwise(splitter_y_new):
+        if bot - top < big_part:
+            continue
+        num_big_parts += 1
         try:
-            num_col, peaks_neg_fin = find_num_col(regions_without_separators_tile,
+            num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot],
                                                   num_col_classifier, tables, multiplier=7.0)
+            #print("big part %d:%d has %d columns" % (top, bot, num_col), peaks_neg_fin)
         except:
             num_col = 0
             peaks_neg_fin = []
@@ -1575,7 +1575,7 @@
             num_col_fin=num_col
             peaks_neg_fin_fin=peaks_neg_fin
 
-    if len(args_big_parts)==1 and (len(peaks_neg_fin_fin)+1)<num_col_classifier:
+    if num_big_parts == 1 and (len(peaks_neg_fin_fin) + 1) < num_col_classifier:
         peaks_neg_fin=find_num_col_by_vertical_lines(vertical)
         peaks_neg_fin=peaks_neg_fin[peaks_neg_fin>=500]
         peaks_neg_fin=peaks_neg_fin[peaks_neg_fin<=(vertical.shape[1]-500)]

From 5d15941350841a4490e002c92ff89a5f6113905 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 24 Oct 2025 01:51:59 +0200
Subject: [PATCH 14/91] `contours_in_same_horizon`: simplify

- array instead of list operations
- return array of index pairs instead of list objects
---
 src/eynollah/utils/__init__.py | 73 ++++++++++++++++------------------
 src/eynollah/utils/contour.py | 25 +++++-------
 2 files changed, 44 insertions(+), 54 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index f948de2..10987ad 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -1315,47 +1315,42 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(
                 float(num_col_classifier))
         if len_lines_bigger_than_x_width_smaller_than_acolumn_width_per_column < 10:
             args_hor=np.arange(len(slope_lines_hor))
-            all_args_uniq=contours_in_same_horizon(cy_main_hor)
-            #print(all_args_uniq,'all_args_uniq')
-            if len(all_args_uniq)>0:
-                if type(all_args_uniq[0]) is list:
-                    special_separators=[]
-                    contours_new=[]
-                    for dd in range(len(all_args_uniq)):
-                        merged_all=None
-                        some_args=args_hor[all_args_uniq[dd]]
-                        some_cy=cy_main_hor[all_args_uniq[dd]]
-                        some_x_min=x_min_main_hor[all_args_uniq[dd]]
-                        some_x_max=x_max_main_hor[all_args_uniq[dd]]
+            sep_pairs=contours_in_same_horizon(cy_main_hor)
+            if len(sep_pairs):
+                special_separators=[]
+                contours_new=[]
+                for pair in sep_pairs:
+                    merged_all=None
+                    some_args=args_hor[pair]
+                    some_cy=cy_main_hor[pair]
+                    some_x_min=x_min_main_hor[pair]
+                    some_x_max=x_max_main_hor[pair]

-                        #img_in=np.zeros(separators_closeup_n[:,:,2].shape)
-                        #print(img_p_in_ver.shape[1],some_x_max-some_x_min,'xdiff')
-                        diff_x_some=some_x_max-some_x_min
-                        for jv in range(len(some_args)):
-                            img_p_in=cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1,1,1))
-                            if any(i_diff>(img_p_in_ver.shape[1]/float(3.3)) for i_diff in diff_x_some):
-                                img_p_in[int(np.mean(some_cy))-5:
-                                         int(np.mean(some_cy))+5,
-                                         int(np.min(some_x_min)):
-                                         int(np.max(some_x_max)) ]=1
-                        sum_dis=dist_x_hor[some_args].sum()
+
diff_max_min_uniques=np.max(x_max_main_hor[some_args])-np.min(x_min_main_hor[some_args]) - if (diff_max_min_uniques > sum_dis and - sum_dis / float(diff_max_min_uniques) > 0.85 and - diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and - np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): - # print(dist_x_hor[some_args], - # dist_x_hor[some_args].sum(), - # np.min(x_min_main_hor[some_args]), - # np.max(x_max_main_hor[some_args]),'jalibdi') - # print(np.mean( dist_x_hor[some_args] ), - # np.std( dist_x_hor[some_args] ), - # np.var( dist_x_hor[some_args] ),'jalibdiha') - special_separators.append(np.mean(cy_main_hor[some_args])) - else: - img_p_in=img_in_hor - special_separators=[] + if (diff_max_min_uniques > sum_dis and + sum_dis / float(diff_max_min_uniques) > 0.85 and + diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and + np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): + # print(dist_x_hor[some_args], + # dist_x_hor[some_args].sum(), + # np.min(x_min_main_hor[some_args]), + # np.max(x_max_main_hor[some_args]),'jalibdi') + # print(np.mean( dist_x_hor[some_args] ), + # np.std( dist_x_hor[some_args] ), + # np.var( dist_x_hor[some_args] ),'jalibdiha') + special_separators.append(np.mean(cy_main_hor[some_args])) else: img_p_in=img_in_hor special_separators=[] diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index f304db2..052688c 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -14,21 +14,16 @@ from shapely.ops import unary_union, nearest_points from .rotate import rotate_image, rotation_image_new def contours_in_same_horizon(cy_main_hor): - X1 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - X2 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - - X1[0::1, :] = cy_main_hor[:] - X2 = X1.T - - X_dif = np.abs(X2 - X1) - args_help = np.array(range(len(cy_main_hor))) - all_args = [] - for i in range(len(cy_main_hor)): - list_h = list(args_help[X_dif[i, :] <= 20]) - list_h.append(i) - if len(list_h) > 1: - all_args.append(list(set(list_h))) - return np.unique(np.array(all_args, dtype=object)) + """ + Takes an array of y coords, identifies all pairs among them + which are close to each other, and returns all such pairs + by index into the array. 
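+
+    A minimal usage sketch (hypothetical values, with the 20px tolerance
+    used below): for cy_main_hor = np.array([10, 12, 100, 105, 300]),
+    this returns array([[0, 1], [2, 3]]), i.e. pairs of indices into the
+    input; the entry 300 remains unpaired.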
+ """ + sort = np.argsort(cy_main_hor) + same = np.diff(cy_main_hor[sort] <= 20) + # groups = np.split(sort, np.arange(len(cy_main_hor) - 1)[~same] + 1) + same = np.flatnonzero(same) + return np.stack((sort[:-1][same], sort[1:][same])).T def find_contours_mean_y_diff(contours_main): M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] From 6cc5900943d5395adbbbea737871413bf10b9ccf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:55:07 +0200 Subject: [PATCH 15/91] `find_num_col`: add better plotting (but commented out) --- src/eynollah/utils/__init__.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 10987ad..4046396 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -485,9 +485,12 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(peaks_neg,'peaks_neg') # fig, (ax1, ax2) = plt.subplots(2, sharex=True) # ax1.imshow(regions_without_separators, aspect="auto") - # ax2.plot(z) - # ax2.scatter(peaks_neg, z[peaks_neg]) - # ax2.axhline(grenze, label="grenze") + # ax2.plot(z, color='red', label='z') + # ax2.plot(zneg[20:], color='blue', label='zneg') + # ax2.scatter(peaks_neg, z[peaks_neg], color='red') + # ax2.scatter(peaks_neg, zneg[20:][peaks_neg], color='blue') + # ax2.axhline(min_peaks_pos, color='red', label="min_peaks_pos") + # ax2.axhline(grenze, color='blue', label="grenze") # ax2.text(0, grenze, "grenze") # plt.show() @@ -816,6 +819,12 @@ def find_num_col_by_vertical_lines(regions_without_separators, multiplier=3.8): peaks, _ = find_peaks(z, height=0) # print(peaks,'peaksnew') + # fig, (ax1, ax2) = plt.subplots(2, sharex=True, suptitle='find_num_col_by_vertical_lines') + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(z) + # ax2.scatter(peaks, z[peaks]) + # ax2.set_title('find_peaks(regions_without_separators.sum(axis=0), height=0)') + # plt.show() return peaks def return_regions_without_separators(regions_pre): From 6fbb5f8a12185192f7d9db7b008c3ef8b5f24d33 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:02:39 +0200 Subject: [PATCH 16/91] `return_boxes_of_images_by_order_of_reading_new`: simplify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - array instead of list operations - add better plotting (but commented out) - add more debug printing (but commented out) - add more inline comments for documentation - rename identifiers to make more readable: - `cy_hor_diff` → `y_max_hor_some` (because the ymax gets passed) - `lines` → `seps` - `y_type_2` → `y_mid` - `y_diff_type_2` → `y_max` - `y_lines_by_order` → `y_mid_by_order` - `y_lines_without_mother` → `y_mid_without_mother` - `y_lines_with_child_without_mother` → `y_mid_with_child_without_mother` - `y_column` → `y_mid_column` - `y_column_nc` → `y_mid_column_nc` - `y_all_between_nm_wc` → `y_mid_between_nm_wc` - `lines_so_close_to_top_separator` → `seps_too_close_to_top_separator` - `y_in_cols` and `y_down` → `y_mid_next` - use `pairwise()` `nc_top:nc_bot` instead of `i_c` indexing --- src/eynollah/utils/__init__.py | 480 +++++++++++++++++---------------- 1 file changed, 247 insertions(+), 233 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 4046396..eca96f3 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1599,19 +1599,31 @@ def 
return_boxes_of_images_by_order_of_reading_new( if logger is None: logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') - # def dbg_plt(box=None, title=None): - # if box is None: - # box = [None, None, None, None] - # img = regions_without_separators[box[2]:box[3], box[0]:box[1]] + + # def dbg_plt(box=None, title=None, rectangles=None, rectangles_showidx=False): + # minx, maxx, miny, maxy = box or (0, None, 0, None) + # img = regions_without_separators[miny:maxy, minx:maxx] # plt.imshow(img) # xrange = np.arange(0, img.shape[1], 100) # yrange = np.arange(0, img.shape[0], 100) - # plt.gca().set_xticks(xrange, xrange + (box[0] or 0)) - # plt.gca().set_yticks(yrange, yrange + (box[2] or 0)) + # ax = plt.gca() + # ax.set_xticks(xrange) + # ax.set_yticks(yrange) + # ax.set_xticklabels(xrange + minx) + # ax.set_yticklabels(yrange + miny) + # def format_coord(x, y): + # return 'x={:g}, y={:g}'.format(x + minx, y + miny) + # ax.format_coord = format_coord # if title: # plt.title(title) + # if rectangles: + # for i, (xmin, xmax, ymin, ymax) in enumerate(rectangles): + # ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # if rectangles_showidx: + # ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i + 1), c='r') # plt.show() - # dbg_plt() + # dbg_plt(title="return_boxes_of_images_by_order_of_reading_new") boxes=[] peaks_neg_tot_tables = [] @@ -1619,9 +1631,7 @@ def return_boxes_of_images_by_order_of_reading_new( width_tot = regions_without_separators.shape[1] for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') - # dbg_plt([None, None, top, bot], - # "image cut for y split %d:%d" % ( - # top, bot)) + # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) & (matrix_of_lines_ch[:,7] < bot)] #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) @@ -1677,20 +1687,21 @@ def return_boxes_of_images_by_order_of_reading_new( peaks_neg_fin = peaks_neg_fin1 else: peaks_neg_fin = peaks_neg_fin2 + # add offset to local result peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - if right < peaks_neg_fin_early[-1]: - peaks_neg_fin_rev.append(right) peaks_neg_fin_rev.extend(peaks_neg_fin) + if right < peaks_neg_fin_early[-1]: + # all but the last column: interject the preexisting boundary + peaks_neg_fin_rev.append(right) + #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): - peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) - num_col=len(peaks_neg_fin) + if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + peaks_neg_fin = peaks_neg_fin_rev else: - peaks_neg_fin=list(np.copy(peaks_neg_fin_org)) - num_col=len(peaks_neg_fin) - + peaks_neg_fin = peaks_neg_fin_org + num_col = len(peaks_neg_fin) #print(peaks_neg_fin,'peaks_neg_fin') except: logger.exception("cannot find peaks consistent with columns") @@ -1700,7 +1711,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] + y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] if right2left_readingorder: x_max_hor_some_new = width_tot - x_min_hor_some @@ -1708,136 +1719,121 @@ def return_boxes_of_images_by_order_of_reading_new( x_min_hor_some 
=list(np.copy(x_min_hor_some_new)) x_max_hor_some =list(np.copy(x_max_hor_some_new)) - peaks_neg_tot = [0] + peaks_neg_fin + [width_tot] + peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) + #print(peaks_neg_tot,'peaks_neg_tot') peaks_neg_tot_tables.append(peaks_neg_tot) - reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \ - y_lines_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, \ - y_lines_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff) - all_columns = set(range(len(peaks_neg_tot) - 1)) - # print("all_columns", all_columns) + #print("all_columns", all_columns) + + reading_order_type, x_starting, x_ending, y_mid, y_max, \ + y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ + there_is_sep_with_child, \ + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ + new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( + x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) + + # show multi-column separators + # dbg_plt([0, None, top, bot], "multi-column separators in current split", + # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], + # y_mid - top, y_max - top)), True) + if (reading_order_type == 1 or - len(y_lines_without_mother) >= 2 or + len(y_mid_without_mother) >= 2 or there_is_sep_with_child == 1): + # there are top-level multi-colspan horizontal separators which overlap each other + # or multiple top-level multi-colspan horizontal separators + # or multi-colspan horizontal separators shorter than their respective top-level: + # todo: explain how this is dealt with try: y_grenze = top + 300 - #check if there is a big separator in this y_mains_sep_ohne_grenzen + up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys=np.arange(len(y_type_2)) + args_early_ys=np.arange(len(y_mid)) #print(args_early_ys,'args_early_ys') - #print(top, bot) + #print(y_mid,'y_mid') - x_starting_up = x_starting[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - x_ending_up = x_ending[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_type_2_up = y_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_diff_type_2_up = y_diff_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - args_up = args_early_ys[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - if len(y_type_2_up) > 0: - y_main_separator_up = y_type_2_up [(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - y_diff_main_separator_up = y_diff_type_2_up[(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - args_main_to_deleted = args_up[(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - #print(y_main_separator_up,y_diff_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_diff_main_separator_up) > 0: + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up = args_early_ys[up] + #print(args_up,'args_up') + #print(y_mid_up,'y_mid_up') + #check if there is a big separator in this y_mains0 + if len(y_mid_up) > 0: + # is there a separator with full-width span? 
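+                    # (hypothetical example: with 3 columns, peaks_neg_tot holds
+                    #  4 boundary positions, so a separator with x_starting == 0
+                    #  and x_ending == 3 spans the full width)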
+ main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) + y_mid_main_separator_up = y_mid_up[main_separator] + y_max_main_separator_up = y_max_up[main_separator] + args_main_to_deleted = args_up[main_separator] + #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') + if len(y_max_main_separator_up): args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, y_diff_main_separator_up.max()]) - # dbg_plt(boxes[-1], "first box") - top = y_diff_main_separator_up.max() + boxes.append([0, peaks_neg_tot[-1], + top, y_max_main_separator_up.max()]) + # dbg_plt(boxes[-1], "near top main separator box") + top = y_max_main_separator_up.max() #print(top,'top') - y_type_2 = y_type_2[args_to_be_kept] + y_mid = y_mid[args_to_be_kept] x_starting = x_starting[args_to_be_kept] x_ending = x_ending[args_to_be_kept] - y_diff_type_2 = y_diff_type_2[args_to_be_kept] + y_max = y_max[args_to_be_kept] #print('galdiha') y_grenze = top + 200 - args_early_ys2=np.arange(len(y_type_2)) - y_type_2_up=y_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - x_starting_up=x_starting[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - x_ending_up=x_ending[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_diff_type_2_up=y_diff_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - args_up2=args_early_ys2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - #print(y_type_2_up,x_starting_up,x_ending_up,'didid') - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') + up = (y_mid > top) & (y_mid <= y_grenze) + args_early_ys2 = np.arange(len(y_mid)) + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up2 = args_early_ys2[up] + #print(y_mid_up,x_starting_up,x_ending_up,'didid') + else: + args_early_ys2 = args_early_ys + args_up2 = args_up - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2=np.array(list( set(args_early_ys2)-set(args_up2) )) + nodes_in = set() + for ij in range(len(x_starting_up)): + nodes_in.update(range(x_starting_up[ij], + x_ending_up[ij])) + #print(nodes_in,'nodes_in') + #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') - if len(args_to_be_kept2)>0: - y_type_2 = y_type_2[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_diff_type_2 = y_diff_type_2[args_to_be_kept2] - else: - pass - #print('burdaydikh2') - elif len(y_diff_main_separator_up)==0: - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in2') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + if nodes_in == set(range(len(peaks_neg_tot)-1)): + pass + elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): + pass + else: + #print('burdaydikh') + args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1,len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - #print(args_early_ys,'args_early_ys') - #print(args_up,'args_up') - args_to_be_kept2=np.array(list( set(args_early_ys) - 
set(args_up) )) - - #print(args_to_be_kept2,'args_to_be_kept2') - #print(len(y_type_2),len(x_starting),len(x_ending),len(y_diff_type_2)) - if len(args_to_be_kept2)>0: - y_type_2 = y_type_2[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_diff_type_2 = y_diff_type_2[args_to_be_kept2] - else: - pass - #print('burdaydikh2') + if len(args_to_be_kept2): + #print(args_to_be_kept2, "args_to_be_kept2") + y_mid = y_mid[args_to_be_kept2] + x_starting = x_starting[args_to_be_kept2] + x_ending = x_ending[args_to_be_kept2] + y_max = y_max[args_to_be_kept2] #int(top) - y_lines_by_order=[] + # order multi-column separators + y_mid_by_order=[] x_start_by_order=[] x_end_by_order=[] if (reading_order_type == 1 or len(x_end_with_child_without_mother) == 0): if reading_order_type == 1: - y_lines_by_order.append(top) + # there are top-level multi-colspan horizontal separators which overlap each other + #print("adding all columns at top because of multiple overlapping mothers") + y_mid_by_order.append(top) x_start_by_order.append(0) x_end_by_order.append(len(peaks_neg_tot)-2) else: + # there are no top-level multi-colspan horizontal separators which themselves + # contain shorter multi-colspan separators #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() for dj in range(len(x_start_without_mother)): @@ -1845,31 +1841,32 @@ def return_boxes_of_images_by_order_of_reading_new( range(x_start_without_mother[dj], x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - ind_args=np.arange(len(y_type_2)) - #ind_args=np.array(ind_args) + ind_args=np.arange(len(y_mid)) #print(ind_args,'ind_args') for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: @@ -1880,93 +1877,113 @@ def return_boxes_of_images_by_order_of_reading_new( range(x_start_without_mother[dj], x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) - y_type_2 = 
np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - columns_covered_by_with_child_no_mothers = set() + columns_covered_by_mothers_with_child = set() for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_with_child_no_mothers.update( + columns_covered_by_mothers_with_child.update( range(x_start_with_child_without_mother[dj], x_end_with_child_without_mother[dj])) - columns_not_covered_child_no_mother = list( - all_columns - columns_covered_by_with_child_no_mothers) + #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") + columns_not_covered_by_mothers_with_child = list( + all_columns - columns_covered_by_mothers_with_child) #indexes_to_be_spanned=[] for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_child_no_mother.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_child_no_mother = np.sort(columns_not_covered_child_no_mother) - ind_args = np.arange(len(y_type_2)) - x_end_with_child_without_mother = np.array(x_end_with_child_without_mother, int) - x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) - for i_s_nc in columns_not_covered_child_no_mother: + columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) + columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) + #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") + ind_args = np.arange(len(y_mid)) + for i_s_nc in columns_not_covered_by_mothers_with_child: if i_s_nc in x_start_with_child_without_mother: + # use only seps with mother's span ("biggest") #print("i_s_nc", i_s_nc) x_end_biggest_column = \ - x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] - args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & - (x_ending==x_end_biggest_column)] - y_column_nc = y_type_2[args_all_biggest_lines] - #x_start_column_nc = x_starting[args_all_biggest_lines] - #x_end_column_nc = x_ending[args_all_biggest_lines] - y_column_nc = np.sort(y_column_nc) - for i_c in range(len(y_column_nc)): + x_end_with_child_without_mother[ + x_start_with_child_without_mother == i_s_nc][0] + args_all_biggest_seps = \ + ind_args[(x_starting == i_s_nc) & + (x_ending == x_end_biggest_column)] + y_mid_column_nc = y_mid[args_all_biggest_seps] + #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") + #x_start_column_nc = x_starting[args_all_biggest_seps] + #x_end_column_nc = x_ending[args_all_biggest_seps] + y_mid_column_nc = np.sort(y_mid_column_nc) + #print(y_mid_column_nc, "y_mid_column_nc (sorted)") + for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): #print("i_c", i_c) - 
ind_all_lines_between_nm_wc = \
-                                        ind_args[(y_type_2 > y_column_nc[i_c]) &
-                                                 (y_type_2 < (y_column_nc[i_c+1]
-                                                              if i_c < len(y_column_nc)-1
-                                                              else bot)) &
+                                #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc")
+                                ind_all_seps_between_nm_wc = \
+                                    ind_args[(y_mid > nc_top) &
+                                             (y_mid < nc_bot) &
                                              (x_starting >= i_s_nc) &
                                              (x_ending <= x_end_biggest_column)]
-                                y_all_between_nm_wc = y_type_2[ind_all_lines_between_nm_wc]
-                                x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc]
-                                x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc]
+                                y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc]
+                                x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc]
+                                x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc]
 
                                 columns_covered_by_mothers = set()
-                                for dj in range(len(ind_all_lines_between_nm_wc)):
+                                for dj in range(len(ind_all_seps_between_nm_wc)):
                                     columns_covered_by_mothers.update(
                                         range(x_starting_all_between_nm_wc[dj],
                                               x_ending_all_between_nm_wc[dj]))
+                                #print(columns_covered_by_mothers, "columns_covered_by_mothers")
                                 child_columns = set(range(i_s_nc, x_end_biggest_column))
                                 columns_not_covered = list(child_columns - columns_covered_by_mothers)
+                                #print(child_columns, "child_columns")
+                                #print(columns_not_covered, "columns_not_covered")
 
-                                if len(ind_all_lines_between_nm_wc):
+                                if len(ind_all_seps_between_nm_wc):
                                     biggest = np.argmax(x_ending_all_between_nm_wc -
                                                         x_starting_all_between_nm_wc)
+                                    #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc")
+                                    #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest],
+                                    #                          x_ending_all_between_nm_wc[biggest]), "biggest")
                                     if columns_covered_by_mothers == set(
                                         range(x_starting_all_between_nm_wc[biggest],
                                               x_ending_all_between_nm_wc[biggest])):
-                                        # biggest accounts for all columns alone,
-                                        # longest line should be extended
-                                        lines_so_close_to_top_separator = \
-                                            ((y_all_between_nm_wc > y_column_nc[i_c]) &
-                                             (y_all_between_nm_wc <= y_column_nc[i_c] + 500))
-                                        if (np.count_nonzero(lines_so_close_to_top_separator) and
-                                            np.count_nonzero(lines_so_close_to_top_separator) <
-                                            len(ind_all_lines_between_nm_wc)):
-                                            y_all_between_nm_wc = \
-                                                y_all_between_nm_wc[~lines_so_close_to_top_separator]
+                                        # single biggest accounts for all covered columns alone,
+                                        # this separator should be extended to cover all
+                                        seps_too_close_to_top_separator = \
+                                            ((y_mid_all_between_nm_wc > nc_top) &
+                                             (y_mid_all_between_nm_wc <= nc_top + 500))
+                                        if (np.count_nonzero(seps_too_close_to_top_separator) and
+                                            np.count_nonzero(seps_too_close_to_top_separator) <
+                                            len(ind_all_seps_between_nm_wc)):
+                                            #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator")
+                                            y_mid_all_between_nm_wc = \
+                                                y_mid_all_between_nm_wc[~seps_too_close_to_top_separator]
                                             x_starting_all_between_nm_wc = \
-                                                x_starting_all_between_nm_wc[~lines_so_close_to_top_separator]
+                                                x_starting_all_between_nm_wc[~seps_too_close_to_top_separator]
                                             x_ending_all_between_nm_wc = \
-                                                x_ending_all_between_nm_wc[~lines_so_close_to_top_separator]
+                                                x_ending_all_between_nm_wc[~seps_too_close_to_top_separator]
 
-                                        y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c])
-                                        x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc)
-                                        x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column)
+                                        y_mid_all_between_nm_wc = np.append(
+                                            y_mid_all_between_nm_wc, nc_top)
+                                        x_starting_all_between_nm_wc = np.append(
+                                            x_starting_all_between_nm_wc, i_s_nc)
+                                        x_ending_all_between_nm_wc = np.append(
+                                            x_ending_all_between_nm_wc, 
x_end_biggest_column) else: - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) if len(columns_not_covered): - y_all_between_nm_wc = np.append( - y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) x_starting_all_between_nm_wc = np.append( x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) x_ending_all_between_nm_wc = np.append( @@ -1977,52 +1994,53 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_all_between_nm_wc[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: - #print(column,'column') + #print(i_s_nc,'column not covered by mothers with child') ind_args_in_col=ind_args[x_starting==i_s_nc] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted = np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - y_lines_by_order = np.array(y_lines_by_order) + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) x_start_by_order = np.array(x_start_by_order) x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_lines_by_order)): + for il in range(len(y_mid_by_order)): #print(il, "il") - y_itself = y_lines_by_order[il] + y_mid_itself = y_mid_by_order[il] x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') - y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] #print('burda') - y_down = y_in_cols.min(initial=bot) #print('burda2') - #print(y_in_cols,'y_in_cols') - #print(y_itself,'y_itself') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + y_mid_next = 
y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') + #print(y_mid_itself,'y_mid_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], - y_itself, - y_down]) + y_mid_itself, + y_mid_next]) # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) except: logger.exception("cannot assign boxes") @@ -2030,20 +2048,21 @@ def return_boxes_of_images_by_order_of_reading_new( top, bot]) # dbg_plt(boxes[-1], "fallback box") else: - y_lines_by_order=[] + # order multi-column separators + y_mid_by_order=[] x_start_by_order=[] x_end_by_order=[] if len(x_starting)>0: - columns_covered_by_lines_covered_more_than_2col = set() + columns_covered_by_seps_covered_more_than_2col = set() for dj in range(len(x_starting)): if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_lines_covered_more_than_2col.update( + columns_covered_by_seps_covered_more_than_2col.update( range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col) + columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2055,53 +2074,52 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, x_ending[0]) else: columns_not_covered = list(all_columns) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + y_mid = np.append(y_mid, np.ones(len(columns_not_covered), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - ind_args = np.arange(len(y_type_2)) - + ind_args = np.arange(len(y_mid)) + for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] - ind_args_col_sorted = np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - y_lines_by_order = np.array(y_lines_by_order) + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) x_start_by_order = np.array(x_start_by_order) x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_lines_by_order)): + for il in range(len(y_mid_by_order)): #print(il, "il") - y_itself = 
y_lines_by_order[il] - #print(y_itself,'y_itself') + y_mid_itself = y_mid_by_order[il] + #print(y_mid_itself,'y_mid_itself') x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] for column in range(x_start_itself, x_end_itself+1): #print(column,'cols') - y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] #print('burda2') - #print(y_in_cols,'y_in_cols') - y_down = y_in_cols.min(initial=bot) - #print(y_down,'y_down') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print(y_mid_next,'y_mid_next') + y_mid_next = y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], - y_itself, - y_down]) + y_mid_itself, + y_mid_next]) # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) - #else: - #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,top, bot]) if right2left_readingorder: peaks_neg_tot_tables_new = [] @@ -2119,11 +2137,7 @@ def return_boxes_of_images_by_order_of_reading_new( peaks_neg_tot_tables = peaks_neg_tot_tables_new # show final xy-cut - # plt.imshow(regions_without_separators) - # for xmin, xmax, ymin, ymax in boxes: - # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, - # fill=False, linewidth=1, edgecolor='r')) - # plt.show() + # dbg_plt(None, "final XY-Cut", boxes, True) logger.debug('exit return_boxes_of_images_by_order_of_reading_new') return boxes, peaks_neg_tot_tables From 66a0e55e49e4224e38c9792d06d2468c7fe8fe90 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:15:13 +0200 Subject: [PATCH 17/91] `return_boxes_of_images_by_order_of_reading_new`: avoid oversplits when y slice (`top:bot`) is not a significant part of the page, viz. less than 22% (as in `find_number_of_columns_in_document`), avoid forcing `find_num_col` to reach `num_col_classifier` (allows large headers not to be split up and thus better ordered) --- src/eynollah/utils/__init__.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index eca96f3..2017cea 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1628,7 +1628,8 @@ def return_boxes_of_images_by_order_of_reading_new( boxes=[] peaks_neg_tot_tables = [] splitter_y_new = np.array(splitter_y_new, dtype=int) - width_tot = regions_without_separators.shape[1] + height_tot, width_tot = regions_without_separators.shape + big_part = 22 * height_tot // 100 # percent height for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) @@ -1644,12 +1645,17 @@ def return_boxes_of_images_by_order_of_reading_new( try: num_col, peaks_neg_fin = find_num_col( regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.) + # we do not expect to get all columns in small parts (headings etc.): + num_col_classifier if bot - top >= big_part else 1, + tables, multiplier=6. if erosion_hurts else 7.) 
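+                # e.g. assuming a page height of 3000px: big_part = 22 * 3000 // 100 = 660,
+                # so a 400px slice (a heading strip, say) is searched with an expected
+                # column count of 1 and thus not forced towards num_col_classifier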
             except:
                 peaks_neg_fin=[]
                 num_col = 0
             try:
-                if (len(peaks_neg_fin)+1<num_col_classifier or num_col_classifier==6):
+                if ((len(peaks_neg_fin) + 1 < num_col_classifier or
+                     num_col_classifier == 6) and
+                    # we do not expect to get all columns in small parts (headings etc.):
+                    bot - top >= big_part):
                     # found too few columns here
                     #print('burda')
                     peaks_neg_fin_org = np.copy(peaks_neg_fin)

From 3ebbc2d693ae14a640c3cb478b6a01cd1e42efb7 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 24 Oct 2025 02:30:39 +0200
Subject: [PATCH 18/91] `return_boxes_of_images_by_order_of_reading_new`:
 indent (by removing unnecessary conditional)

---
 src/eynollah/utils/__init__.py | 843 ++++++++++++++++-----------------
 1 file changed, 421 insertions(+), 422 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index 2017cea..f30d55e 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -1641,241 +1641,204 @@ def return_boxes_of_images_by_order_of_reading_new(
         #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and
         #   np.max(matrix_new[:,8][matrix_new[:,9]==1]) >=
         #   0.1 * (np.abs(bot-top))):
-        if True:
-            try:
-                num_col, peaks_neg_fin = find_num_col(
-                    regions_without_separators[top:bot],
-                    # we do not expect to get all columns in small parts (headings etc.):
-                    num_col_classifier if bot - top >= big_part else 1,
-                    tables, multiplier=6. if erosion_hurts else 7.)
-            except:
-                peaks_neg_fin=[]
-                num_col = 0
-            try:
-                if ((len(peaks_neg_fin) + 1 < num_col_classifier or
-                     num_col_classifier == 6) and
-                    # we do not expect to get all columns in small parts (headings etc.):
-                    bot - top >= big_part):
-                    # found too few columns here
-                    #print('burda')
-                    peaks_neg_fin_org = np.copy(peaks_neg_fin)
-                    #print("peaks_neg_fin_org", peaks_neg_fin_org)
-                    if len(peaks_neg_fin)==0:
-                        num_col, peaks_neg_fin = find_num_col(
-                            regions_without_separators[top:bot],
-                            num_col_classifier, tables, multiplier=3.)
-                    #print(peaks_neg_fin,'peaks_neg_fin')
-                    peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1]
+        try:
+            num_col, peaks_neg_fin = find_num_col(
+                regions_without_separators[top:bot],
+                # we do not expect to get all columns in small parts (headings etc.):
+                num_col_classifier if bot - top >= big_part else 1,
+                tables, multiplier=6. if erosion_hurts else 7.)
+        except:
+            peaks_neg_fin=[]
+            num_col = 0
+        try:
+            if ((len(peaks_neg_fin) + 1 < num_col_classifier or
+                 num_col_classifier == 6) and
+                # we do not expect to get all columns in small parts (headings etc.):
+                bot - top >= big_part):
+                # found too few columns here
+                #print('burda')
+                peaks_neg_fin_org = np.copy(peaks_neg_fin)
+                #print("peaks_neg_fin_org", peaks_neg_fin_org)
+                if len(peaks_neg_fin)==0:
+                    num_col, peaks_neg_fin = find_num_col(
+                        regions_without_separators[top:bot],
+                        num_col_classifier, tables, multiplier=3.)
+                #print(peaks_neg_fin,'peaks_neg_fin')
+                peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1]
 
-                    #print(peaks_neg_fin_early,'burda2')
-                    peaks_neg_fin_rev=[]
-                    for left, right in pairwise(peaks_neg_fin_early):
-                        # print("%d:%d" % (left, right), 'i_n')
-                        # dbg_plt([left, right, top, bot],
-                        #         "image cut for y split %d:%d / x gap %d:%d" % (
-                        #     top, bot, left, right))
-                        # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0))
-                        # plt.title("vertical projection (sum over y)")
-                        # plt.show()
-                        try:
-                            _, peaks_neg_fin1 = find_num_col(
-                                regions_without_separators[top:bot, left:right],
-                                num_col_classifier, tables, multiplier=7.)
-                        except:
-                            peaks_neg_fin1 = []
-                        try:
-                            _, peaks_neg_fin2 = find_num_col(
-                                regions_without_separators[top:bot, left:right],
-                                num_col_classifier, tables, multiplier=5.)
- except: - peaks_neg_fin2 = [] - if len(peaks_neg_fin1) >= len(peaks_neg_fin2): - peaks_neg_fin = peaks_neg_fin1 - else: - peaks_neg_fin = peaks_neg_fin2 - # add offset to local result - peaks_neg_fin = list(np.array(peaks_neg_fin) + left) - #print(peaks_neg_fin,'peaks_neg_fin') - - peaks_neg_fin_rev.extend(peaks_neg_fin) - if right < peaks_neg_fin_early[-1]: - # all but the last column: interject the preexisting boundary - peaks_neg_fin_rev.append(right) - #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - - if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): - peaks_neg_fin = peaks_neg_fin_rev + #print(peaks_neg_fin_early,'burda2') + peaks_neg_fin_rev=[] + for left, right in pairwise(peaks_neg_fin_early): + # print("%d:%d" % (left, right), 'i_n') + # dbg_plt([left, right, top, bot], + # "image cut for y split %d:%d / x gap %d:%d" % ( + # top, bot, left, right)) + # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) + # plt.title("vertical projection (sum over y)") + # plt.show() + try: + _, peaks_neg_fin1 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=7.) + except: + peaks_neg_fin1 = [] + try: + _, peaks_neg_fin2 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=5.) + except: + peaks_neg_fin2 = [] + if len(peaks_neg_fin1) >= len(peaks_neg_fin2): + peaks_neg_fin = peaks_neg_fin1 else: - peaks_neg_fin = peaks_neg_fin_org - num_col = len(peaks_neg_fin) + peaks_neg_fin = peaks_neg_fin2 + # add offset to local result + peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - except: - logger.exception("cannot find peaks consistent with columns") - #num_col, peaks_neg_fin = find_num_col( - # regions_without_separators[top:bot,:], - # multiplier=7.0) - x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] - x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] - cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - if right2left_readingorder: - x_max_hor_some_new = width_tot - x_min_hor_some - x_min_hor_some_new = width_tot - x_max_hor_some - x_min_hor_some =list(np.copy(x_min_hor_some_new)) - x_max_hor_some =list(np.copy(x_max_hor_some_new)) + peaks_neg_fin_rev.extend(peaks_neg_fin) + if right < peaks_neg_fin_early[-1]: + # all but the last column: interject the preexisting boundary + peaks_neg_fin_rev.append(right) + #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) - #print(peaks_neg_tot,'peaks_neg_tot') - peaks_neg_tot_tables.append(peaks_neg_tot) + if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + peaks_neg_fin = peaks_neg_fin_rev + else: + peaks_neg_fin = peaks_neg_fin_org + num_col = len(peaks_neg_fin) + #print(peaks_neg_fin,'peaks_neg_fin') + except: + logger.exception("cannot find peaks consistent with columns") + #num_col, peaks_neg_fin = find_num_col( + # regions_without_separators[top:bot,:], + # multiplier=7.0) + x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] + x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] + cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] + y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - all_columns = set(range(len(peaks_neg_tot) - 1)) - #print("all_columns", all_columns) + if right2left_readingorder: + x_max_hor_some_new = width_tot - x_min_hor_some + x_min_hor_some_new = width_tot - x_max_hor_some + x_min_hor_some 
=list(np.copy(x_min_hor_some_new)) + x_max_hor_some =list(np.copy(x_max_hor_some_new)) - reading_order_type, x_starting, x_ending, y_mid, y_max, \ - y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ - there_is_sep_with_child, \ - y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) + peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) + #print(peaks_neg_tot,'peaks_neg_tot') + peaks_neg_tot_tables.append(peaks_neg_tot) - # show multi-column separators - # dbg_plt([0, None, top, bot], "multi-column separators in current split", - # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], - # y_mid - top, y_max - top)), True) + all_columns = set(range(len(peaks_neg_tot) - 1)) + #print("all_columns", all_columns) - if (reading_order_type == 1 or - len(y_mid_without_mother) >= 2 or - there_is_sep_with_child == 1): - # there are top-level multi-colspan horizontal separators which overlap each other - # or multiple top-level multi-colspan horizontal separators - # or multi-colspan horizontal separators shorter than their respective top-level: - # todo: explain how this is dealt with - try: - y_grenze = top + 300 - up = (y_mid > top) & (y_mid <= y_grenze) + reading_order_type, x_starting, x_ending, y_mid, y_max, \ + y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ + there_is_sep_with_child, \ + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ + new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( + x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) - args_early_ys=np.arange(len(y_mid)) - #print(args_early_ys,'args_early_ys') - #print(y_mid,'y_mid') + # show multi-column separators + # dbg_plt([0, None, top, bot], "multi-column separators in current split", + # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], + # y_mid - top, y_max - top)), True) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up = args_early_ys[up] - #print(args_up,'args_up') - #print(y_mid_up,'y_mid_up') - #check if there is a big separator in this y_mains0 - if len(y_mid_up) > 0: - # is there a separator with full-width span? 
- main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) - y_mid_main_separator_up = y_mid_up[main_separator] - y_max_main_separator_up = y_max_up[main_separator] - args_main_to_deleted = args_up[main_separator] - #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_max_main_separator_up): - args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) - #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[-1], - top, y_max_main_separator_up.max()]) - # dbg_plt(boxes[-1], "near top main separator box") - top = y_max_main_separator_up.max() + if (reading_order_type == 1 or + len(y_mid_without_mother) >= 2 or + there_is_sep_with_child == 1): + # there are top-level multi-colspan horizontal separators which overlap each other + # or multiple top-level multi-colspan horizontal separators + # or multi-colspan horizontal separators shorter than their respective top-level: + # todo: explain how this is dealt with + try: + y_grenze = top + 300 + up = (y_mid > top) & (y_mid <= y_grenze) - #print(top,'top') - y_mid = y_mid[args_to_be_kept] - x_starting = x_starting[args_to_be_kept] - x_ending = x_ending[args_to_be_kept] - y_max = y_max[args_to_be_kept] + args_early_ys=np.arange(len(y_mid)) + #print(args_early_ys,'args_early_ys') + #print(y_mid,'y_mid') - #print('galdiha') - y_grenze = top + 200 - up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys2 = np.arange(len(y_mid)) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up2 = args_early_ys2[up] - #print(y_mid_up,x_starting_up,x_ending_up,'didid') - else: - args_early_ys2 = args_early_ys - args_up2 = args_up + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up = args_early_ys[up] + #print(args_up,'args_up') + #print(y_mid_up,'y_mid_up') + #check if there is a big separator in this y_mains0 + if len(y_mid_up) > 0: + # is there a separator with full-width span? 
+ main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) + y_mid_main_separator_up = y_mid_up[main_separator] + y_max_main_separator_up = y_max_up[main_separator] + args_main_to_deleted = args_up[main_separator] + #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') + if len(y_max_main_separator_up): + args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) + #print(args_to_be_kept,'args_to_be_kept') + boxes.append([0, peaks_neg_tot[-1], + top, y_max_main_separator_up.max()]) + # dbg_plt(boxes[-1], "near top main separator box") + top = y_max_main_separator_up.max() - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + #print(top,'top') + y_mid = y_mid[args_to_be_kept] + x_starting = x_starting[args_to_be_kept] + x_ending = x_ending[args_to_be_kept] + y_max = y_max[args_to_be_kept] - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) - - if len(args_to_be_kept2): - #print(args_to_be_kept2, "args_to_be_kept2") - y_mid = y_mid[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_max = y_max[args_to_be_kept2] - - #int(top) - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if (reading_order_type == 1 or - len(x_end_with_child_without_mother) == 0): - if reading_order_type == 1: - # there are top-level multi-colspan horizontal separators which overlap each other - #print("adding all columns at top because of multiple overlapping mothers") - y_mid_by_order.append(top) - x_start_by_order.append(0) - x_end_by_order.append(len(peaks_neg_tot)-2) - else: - # there are no top-level multi-colspan horizontal separators which themselves - # contain shorter multi-colspan separators - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - ind_args=np.arange(len(y_mid)) - #print(ind_args,'ind_args') - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - 
ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + #print('galdiha') + y_grenze = top + 200 + up = (y_mid > top) & (y_mid <= y_grenze) + args_early_ys2 = np.arange(len(y_mid)) + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up2 = args_early_ys2[up] + #print(y_mid_up,x_starting_up,x_ending_up,'didid') else: + args_early_ys2 = args_early_ys + args_up2 = args_up + + nodes_in = set() + for ij in range(len(x_starting_up)): + nodes_in.update(range(x_starting_up[ij], + x_ending_up[ij])) + #print(nodes_in,'nodes_in') + #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + + if nodes_in == set(range(len(peaks_neg_tot)-1)): + pass + elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): + pass + else: + #print('burdaydikh') + args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) + + if len(args_to_be_kept2): + #print(args_to_be_kept2, "args_to_be_kept2") + y_mid = y_mid[args_to_be_kept2] + x_starting = x_starting[args_to_be_kept2] + x_ending = x_ending[args_to_be_kept2] + y_max = y_max[args_to_be_kept2] + + #int(top) + # order multi-column separators + y_mid_by_order=[] + x_start_by_order=[] + x_end_by_order=[] + if (reading_order_type == 1 or + len(x_end_with_child_without_mother) == 0): + if reading_order_type == 1: + # there are top-level multi-colspan horizontal separators which overlap each other + #print("adding all columns at top because of multiple overlapping mothers") + y_mid_by_order.append(top) + x_start_by_order.append(0) + x_end_by_order.append(len(peaks_neg_tot)-2) + else: + # there are no top-level multi-colspan horizontal separators which themselves + # contain shorter multi-colspan separators #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() for dj in range(len(x_start_without_mother)): @@ -1895,212 +1858,170 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - columns_covered_by_mothers_with_child = set() - for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_mothers_with_child.update( - range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) - #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") - columns_not_covered_by_mothers_with_child = list( - all_columns - columns_covered_by_mothers_with_child) - #indexes_to_be_spanned=[] - for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) - #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") - ind_args = np.arange(len(y_mid)) - for i_s_nc in columns_not_covered_by_mothers_with_child: - if i_s_nc in x_start_with_child_without_mother: - # use only seps with mother's span ("biggest") - #print("i_s_nc", i_s_nc) - x_end_biggest_column = \ - x_end_with_child_without_mother[ - x_start_with_child_without_mother == i_s_nc][0] - args_all_biggest_seps = \ - ind_args[(x_starting == i_s_nc) & - (x_ending == x_end_biggest_column)] - y_mid_column_nc = 
y_mid[args_all_biggest_seps] - #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") - #x_start_column_nc = x_starting[args_all_biggest_seps] - #x_end_column_nc = x_ending[args_all_biggest_seps] - y_mid_column_nc = np.sort(y_mid_column_nc) - #print(y_mid_column_nc, "y_mid_column_nc (sorted)") - for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): - #print("i_c", i_c) - #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") - ind_all_seps_between_nm_wc = \ - ind_args[(y_mid > nc_top) & - (y_mid < nc_bot) & - (x_starting >= i_s_nc) & - (x_ending <= x_end_biggest_column)] - y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] - - columns_covered_by_mothers = set() - for dj in range(len(ind_all_seps_between_nm_wc)): - columns_covered_by_mothers.update( - range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - child_columns = set(range(i_s_nc, x_end_biggest_column)) - columns_not_covered = list(child_columns - columns_covered_by_mothers) - #print(child_columns, "child_columns") - #print(columns_not_covered, "columns_not_covered") - - if len(ind_all_seps_between_nm_wc): - biggest = np.argmax(x_ending_all_between_nm_wc - - x_starting_all_between_nm_wc) - #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") - #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest]), "biggest") - if columns_covered_by_mothers == set( - range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])): - # single biggest accounts for all covered columns alone, - # this separator should be extended to cover all - seps_too_close_to_top_separator = \ - ((y_mid_all_between_nm_wc > nc_top) & - (y_mid_all_between_nm_wc <= nc_top + 500)) - if (np.count_nonzero(seps_too_close_to_top_separator) and - np.count_nonzero(seps_too_close_to_top_separator) < - len(ind_all_seps_between_nm_wc)): - #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") - y_mid_all_between_nm_wc = \ - y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] - x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] - x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] - - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_end_biggest_column) - else: - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - - if len(columns_not_covered): - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) - - ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(int(i_s_nc), int(x_end_biggest_column)): - 
ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] - x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] - x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - else: - #print(i_s_nc,'column not covered by mothers with child') - ind_args_in_col=ind_args[x_starting==i_s_nc] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(int(x_start_itself), int(x_end_itself)+1): - #print(column,'cols') - #print('burda') - #print('burda2') - y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') - #print(y_mid_itself,'y_mid_itself') - boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) - except: - logger.exception("cannot assign boxes") - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, bot]) - # dbg_plt(boxes[-1], "fallback box") - else: - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if len(x_starting)>0: - columns_covered_by_seps_covered_more_than_2col = set() - for dj in range(len(x_starting)): - if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_seps_covered_more_than_2col.update( - range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - if len(new_main_sep_y) > 0: - x_starting = np.append(x_starting, 0) - x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) - else: - x_starting = np.append(x_starting, x_starting[0]) - x_ending = np.append(x_ending, x_ending[0]) + ind_args=np.arange(len(y_mid)) + #print(ind_args,'ind_args') + for column in range(len(peaks_neg_tot)-1): + #print(column,'column') + ind_args_in_col=ind_args[x_starting==column] + #print('babali2') + 
#print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + #print('babali3') + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: - columns_not_covered = list(all_columns) - y_mid = np.append(y_mid, np.ones(len(columns_not_covered), + #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') + columns_covered_by_mothers = set() + for dj in range(len(x_start_without_mother)): + columns_covered_by_mothers.update( + range(x_start_without_mother[dj], + x_end_without_mother[dj])) + columns_not_covered = list(all_columns - columns_covered_by_mothers) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), dtype=int) * top) ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + x_starting = np.append(x_starting, np.array(columns_not_covered, int)) + x_starting = np.append(x_starting, x_start_without_mother) + x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) + x_ending = np.append(x_ending, x_end_without_mother) - ind_args = np.arange(len(y_mid)) + columns_covered_by_mothers_with_child = set() + for dj in range(len(x_end_with_child_without_mother)): + columns_covered_by_mothers_with_child.update( + range(x_start_with_child_without_mother[dj], + x_end_with_child_without_mother[dj])) + #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") + columns_not_covered_by_mothers_with_child = list( + all_columns - columns_covered_by_mothers_with_child) + #indexes_to_be_spanned=[] + for i_s in range(len(x_end_with_child_without_mother)): + columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) + columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) + #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") + ind_args = np.arange(len(y_mid)) + for i_s_nc in columns_not_covered_by_mothers_with_child: + if i_s_nc in x_start_with_child_without_mother: + # use only seps with mother's span ("biggest") + #print("i_s_nc", i_s_nc) + x_end_biggest_column = \ + x_end_with_child_without_mother[ + x_start_with_child_without_mother == i_s_nc][0] + args_all_biggest_seps = \ + ind_args[(x_starting == i_s_nc) & + (x_ending == x_end_biggest_column)] + y_mid_column_nc = y_mid[args_all_biggest_seps] + #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") + #x_start_column_nc = x_starting[args_all_biggest_seps] + #x_end_column_nc = x_ending[args_all_biggest_seps] + y_mid_column_nc = np.sort(y_mid_column_nc) + #print(y_mid_column_nc, "y_mid_column_nc (sorted)") + for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): + #print("i_c", i_c) + #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") + ind_all_seps_between_nm_wc = \ + ind_args[(y_mid > nc_top) & + (y_mid < nc_bot) 
&
+                                         (x_starting >= i_s_nc) &
+                                         (x_ending <= x_end_biggest_column)]
+                            y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc]
+                            x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc]
+                            x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc]
 
+                            columns_covered_by_mothers = set()
+                            for dj in range(len(ind_all_seps_between_nm_wc)):
+                                columns_covered_by_mothers.update(
+                                    range(x_starting_all_between_nm_wc[dj],
+                                          x_ending_all_between_nm_wc[dj]))
+                            #print(columns_covered_by_mothers, "columns_covered_by_mothers")
+                            child_columns = set(range(i_s_nc, x_end_biggest_column))
+                            columns_not_covered = list(child_columns - columns_covered_by_mothers)
+                            #print(child_columns, "child_columns")
+                            #print(columns_not_covered, "columns_not_covered")
 
+                            if len(ind_all_seps_between_nm_wc):
+                                biggest = np.argmax(x_ending_all_between_nm_wc -
+                                                    x_starting_all_between_nm_wc)
+                                #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc")
+                                #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest],
+                                #                          x_ending_all_between_nm_wc[biggest]), "biggest")
+                                if columns_covered_by_mothers == set(
+                                    range(x_starting_all_between_nm_wc[biggest],
+                                          x_ending_all_between_nm_wc[biggest])):
+                                    # single biggest accounts for all covered columns alone,
+                                    # this separator should be extended to cover all
+                                    seps_too_close_to_top_separator = \
+                                        ((y_mid_all_between_nm_wc > nc_top) &
+                                         (y_mid_all_between_nm_wc <= nc_top + 500))
+                                    if (np.count_nonzero(seps_too_close_to_top_separator) and
+                                        np.count_nonzero(seps_too_close_to_top_separator) <
+                                        len(ind_all_seps_between_nm_wc)):
+                                        #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator")
+                                        y_mid_all_between_nm_wc = \
+                                            y_mid_all_between_nm_wc[~seps_too_close_to_top_separator]
+                                        x_starting_all_between_nm_wc = \
+                                            x_starting_all_between_nm_wc[~seps_too_close_to_top_separator]
+                                        x_ending_all_between_nm_wc = \
+                                            x_ending_all_between_nm_wc[~seps_too_close_to_top_separator]
 
+                                    y_mid_all_between_nm_wc = np.append(
+                                        y_mid_all_between_nm_wc, nc_top)
+                                    x_starting_all_between_nm_wc = np.append(
+                                        x_starting_all_between_nm_wc, i_s_nc)
+                                    x_ending_all_between_nm_wc = np.append(
+                                        x_ending_all_between_nm_wc, x_end_biggest_column)
+                                else:
+                                    y_mid_all_between_nm_wc = np.append(
+                                        y_mid_all_between_nm_wc, nc_top)
+                                    x_starting_all_between_nm_wc = np.append(
+                                        x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest])
+                                    x_ending_all_between_nm_wc = np.append(
+                                        x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest])
 
+                                if len(columns_not_covered):
+                                    y_mid_all_between_nm_wc = np.append(
+                                        y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered))
+                                    x_starting_all_between_nm_wc = np.append(
+                                        x_starting_all_between_nm_wc, np.array(columns_not_covered, int))
+                                    x_ending_all_between_nm_wc = np.append(
+                                        x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1)
 
+                                ind_args_between=np.arange(len(x_ending_all_between_nm_wc))
+                                for column in range(int(i_s_nc), int(x_end_biggest_column)):
+                                    
ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] + x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] + x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] + #print('babali3') + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + else: + #print(i_s_nc,'column not covered by mothers with child') + ind_args_in_col=ind_args[x_starting==i_s_nc] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + #print('babali3') + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) # create single-column boxes from multi-column separators y_mid_by_order = np.array(y_mid_by_order) @@ -2109,23 +2030,101 @@ def return_boxes_of_images_by_order_of_reading_new( for il in range(len(y_mid_by_order)): #print(il, "il") y_mid_itself = y_mid_by_order[il] - #print(y_mid_itself,'y_mid_itself') x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] - for column in range(x_start_itself, x_end_itself+1): + for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') + #print('burda') #print('burda2') y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & (column >= x_start_by_order) & (column <= x_end_by_order)] - #print(y_mid_next,'y_mid_next') y_mid_next = y_mid_next.min(initial=bot) #print(y_mid_next,'y_mid_next') + #print(y_mid_itself,'y_mid_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_mid_itself, y_mid_next]) - # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) + # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) + except: + logger.exception("cannot assign boxes") + boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], + top, bot]) + # dbg_plt(boxes[-1], "fallback box") + else: + # order multi-column separators + y_mid_by_order=[] + x_start_by_order=[] + x_end_by_order=[] + if len(x_starting)>0: + columns_covered_by_seps_covered_more_than_2col = set() + for dj in range(len(x_starting)): + if set(range(x_starting[dj], x_ending[dj])) != all_columns: + columns_covered_by_seps_covered_more_than_2col.update( + range(x_starting[dj], x_ending[dj])) + columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) + + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) + ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) + x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + if len(new_main_sep_y) > 0: + x_starting = np.append(x_starting, 0) + x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) + else: + x_starting = np.append(x_starting, x_starting[0]) + x_ending = np.append(x_ending, x_ending[0]) + else: + columns_not_covered = list(all_columns) + y_mid = np.append(y_mid, 
np.ones(len(columns_not_covered), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) + ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) + x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + + ind_args = np.arange(len(y_mid)) + + for column in range(len(peaks_neg_tot)-1): + #print(column,'column') + ind_args_in_col=ind_args[x_starting==column] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) + for il in range(len(y_mid_by_order)): + #print(il, "il") + y_mid_itself = y_mid_by_order[il] + #print(y_mid_itself,'y_mid_itself') + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] + for column in range(x_start_itself, x_end_itself+1): + #print(column,'cols') + #print('burda2') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print(y_mid_next,'y_mid_next') + y_mid_next = y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') + boxes.append([peaks_neg_tot[column], + peaks_neg_tot[column+1], + y_mid_itself, + y_mid_next]) + # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) if right2left_readingorder: peaks_neg_tot_tables_new = [] From a2a9fe51175cfd11bc62d1e917bf79b299a7846e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:35:04 +0200 Subject: [PATCH 19/91] `delete_separator_around`: simplify, eynollah: identifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - use array instead of list operations - rename identifiers: - `pixel` → `label` - `line` → `sep` --- src/eynollah/eynollah.py | 104 ++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 57 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 08ffed7..eee3777 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2669,45 +2669,35 @@ class Eynollah: return layout_org, contours_new - def delete_separator_around(self, spliter_y,peaks_neg,image_by_region, pixel_line, pixel_table): + def delete_separator_around(self, splitter_y, peaks_neg, image_by_region, label_seps, label_table): # format of subboxes: box=[x1, x2 , y1, y2] pix_del = 100 - if len(image_by_region.shape)==3: - for i in range(len(spliter_y)-1): - for j in range(1,len(peaks_neg[i])-1): - ys = slice(int(spliter_y[i]), - int(spliter_y[i+1])) - xs = slice(peaks_neg[i][j] - pix_del, - peaks_neg[i][j] + pix_del) - image_by_region[ys,xs,0][image_by_region[ys,xs,0]==pixel_line] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,1]==pixel_line] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,2]==pixel_line] = 0 - - image_by_region[ys,xs,0][image_by_region[ys,xs,0]==pixel_table] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,1]==pixel_table] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,2]==pixel_table] = 0 - 
else:
-            for i in range(len(spliter_y)-1):
-                for j in range(1,len(peaks_neg[i])-1):
-                    ys = slice(int(spliter_y[i]),
-                               int(spliter_y[i+1]))
-                    xs = slice(peaks_neg[i][j] - pix_del,
-                               peaks_neg[i][j] + pix_del)
-                    image_by_region[ys,xs][image_by_region[ys,xs]==pixel_line] = 0
-                    image_by_region[ys,xs][image_by_region[ys,xs]==pixel_table] = 0
+        for i in range(len(splitter_y)-1):
+            for j in range(1,len(peaks_neg[i])-1):
+                where = np.index_exp[splitter_y[i]:
+                                     splitter_y[i+1],
+                                     peaks_neg[i][j] - pix_del:
+                                     peaks_neg[i][j] + pix_del,
+                                     :]
+                if image_by_region.ndim < 3:
+                    where = where[:2]
+                image_by_region[where][image_by_region[where] == label_seps] = 0
+                image_by_region[where][image_by_region[where] == label_table] = 0
         return image_by_region
 
     def add_tables_heuristic_to_layout(
             self, image_regions_eraly_p, boxes,
-            slope_mean_hor, spliter_y, peaks_neg_tot, image_revised,
-            num_col_classifier, min_area, pixel_line):
+            slope_mean_hor, splitter_y, peaks_neg_tot, image_revised,
+            num_col_classifier, min_area, label_seps):
 
-        pixel_table =10
-        image_revised_1 = self.delete_separator_around(spliter_y, peaks_neg_tot, image_revised, pixel_line, pixel_table)
+        label_table =10
+        image_revised_1 = self.delete_separator_around(splitter_y, peaks_neg_tot, image_revised, label_seps, label_table)
         try:
-            image_revised_1[:,:30][image_revised_1[:,:30]==pixel_line] = 0
-            image_revised_1[:,-30:][image_revised_1[:,-30:]==pixel_line] = 0
+            image_revised_1[:,:30][image_revised_1[:,:30]==label_seps] = 0
+            image_revised_1[:,-30:][image_revised_1[:,-30:]==label_seps] = 0
         except:
             pass
         boxes = np.array(boxes, dtype=int) # to be on the safe side
@@ -2718,7 +2708,7 @@ class Eynollah:
             _, thresh = cv2.threshold(image_col, 0, 255, 0)
             contours,hirarchy=cv2.findContours(thresh.copy(), cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
 
-            if indiv==pixel_table:
+            if indiv==label_table:
                 main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy,
                                                                      max_area=1, min_area=0.001)
             else:
@@ -2734,11 +2724,11 @@ class Eynollah:
             box_xs = slice(*boxes[i][0:2])
             image_box = img_comm[box_ys, box_xs]
             try:
-                image_box_tabels_1 = (image_box == pixel_table) * 1
+                image_box_tabels_1 = (image_box == label_table) * 1
                 contours_tab,_=return_contours_of_image(image_box_tabels_1)
                 contours_tab=filter_contours_area_of_image_tables(image_box_tabels_1,contours_tab,_,1,0.003)
-                image_box_tabels_1 = (image_box == pixel_line).astype(np.uint8) * 1
-                image_box_tabels_and_m_text = ( (image_box == pixel_table) |
+                image_box_tabels_1 = (image_box == label_seps).astype(np.uint8) * 1
+                image_box_tabels_and_m_text = ( (image_box == label_table) |
                                                 (image_box == 1) ).astype(np.uint8) * 1
                 image_box_tabels_1 = cv2.dilate(image_box_tabels_1, KERNEL, iterations=5)
@@ -2800,7 +2790,7 @@ class Eynollah:
                 y_up_tabs=[]
 
             for ii in range(len(y_up_tabs)):
-                image_box[y_up_tabs[ii]:y_down_tabs[ii]] = pixel_table
+                image_box[y_up_tabs[ii]:y_down_tabs[ii]] = label_table
 
             image_revised_last[box_ys, box_xs] = image_box
         else:
@@ -2811,14 +2801,14 @@ class Eynollah:
                 image_revised_last[box_ys, box_xs] = image_box
 
         if num_col_classifier==1:
-            img_tables_col_1 = (image_revised_last == pixel_table).astype(np.uint8)
+            img_tables_col_1 = (image_revised_last == label_table).astype(np.uint8)
             contours_table_col1, _ = return_contours_of_image(img_tables_col_1)
             _,_ ,_ , _, y_min_tab_col1 ,y_max_tab_col1, _= find_new_features_of_contours(contours_table_col1)
 
             if len(y_min_tab_col1)>0:
                 for ijv in range(len(y_min_tab_col1)):
-
image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = pixel_table + image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = label_table return image_revised_last def get_tables_from_model(self, img, num_col_classifier): @@ -3153,14 +3143,14 @@ class Eynollah: text_regions_p_1_n = None textline_mask_tot_d = None regions_without_separators_d = None - pixel_lines = 3 + label_seps = 3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - _, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, pixel_lines) + _, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( + text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_1_n, num_col_classifier, self.tables, label_seps) #print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -3175,7 +3165,7 @@ class Eynollah: t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes_d = None self.logger.debug("len(boxes): %s", len(boxes)) @@ -3187,17 +3177,17 @@ class Eynollah: else: text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[(table_prediction == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, - num_col_classifier , 0.000005, pixel_line) + num_col_classifier , 0.000005, label_seps) #print(time.time()-t_0_box,'time box in 3.2') img_revised_tab2, contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction, 10, num_col_classifier) #print(time.time()-t_0_box,'time box in 3.3') else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) @@ -3210,11 +3200,11 @@ class Eynollah: text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes_d, 0, splitter_y_new_d, peaks_neg_tot_tables_d, text_regions_p_tables, - num_col_classifier, 0.000005, pixel_line) + num_col_classifier, 0.000005, label_seps) img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction_n, 10, num_col_classifier) @@ -3333,14 +3323,14 @@ class Eynollah: regions_without_separators = (text_regions_p[:,:] == 1)*1 regions_without_separators[table_prediction == 1] = 1 - pixel_lines=3 + label_seps=3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = 
find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, pixel_lines) + text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: num_col_d, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) + text_regions_p_1_n, num_col_classifier, self.tables, label_seps) if num_col_classifier>=3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3359,10 +3349,10 @@ class Eynollah: num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[:,:][(table_prediction[:,:]==1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, - num_col_classifier , 0.000005, pixel_line) + num_col_classifier , 0.000005, label_seps) img_revised_tab2,contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction, 10, num_col_classifier) @@ -3374,11 +3364,11 @@ class Eynollah: text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes_d, 0, splitter_y_new_d, peaks_neg_tot_tables_d, text_regions_p_tables, - num_col_classifier, 0.000005, pixel_line) + num_col_classifier, 0.000005, label_seps) img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction_n, 10, num_col_classifier) @@ -4721,12 +4711,12 @@ class Eynollah: regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( + boxes, _ = return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( + boxes_d, _ = return_boxes_of_images_by_order_of_reading_new( splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) From 3367462d181bca16316e84957299e0abb08ec0d8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 22:46:46 +0200 Subject: [PATCH 20/91] `return_boxes_of_images_by_order_of_reading_new`: change arg order --- src/eynollah/utils/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f30d55e..a163fad 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -33,7 +33,7 @@ def pairwise(iterable): a = b def return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, y_max_hor_some): + peak_points, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some): """ Analyse which separators overlap multiple column candidates, and how they overlap each other. @@ -54,10 +54,10 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( with no mother, specifically (and thus, no simple box separation is possible). 
Arguments:
+    * the x column coordinates
     * the x start column index of the raw separators
     * the x end column index of the raw separators
     * the y center coordinate of the raw separators
-    * the x column coordinates
     * the y end coordinate of the raw separators
 
     Returns:
@@ -1736,7 +1736,7 @@ def return_boxes_of_images_by_order_of_reading_new(
                     there_is_sep_with_child, \
                     y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \
                     new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order(
-                        x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some)
+                        peaks_neg_tot, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some)
 
                     # show multi-column separators
                     # dbg_plt([0, None, top, bot], "multi-column separators in current split",

From 19b2c3fa424f8750e093a2fb88d7e6e381daeaab Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 24 Oct 2025 22:51:19 +0200
Subject: [PATCH 21/91] reading order: improve handling of headings and
 horizontal seps

- drop connected components analysis to test overlaps between horizontal
  separators and (horizontal) neighbours (introduced in ab17a927)
- instead of converting headings to topline and baseline during
  `find_number_of_columns_in_document` (introduced in 9f1595d7), add them to
  the matrix unchanged, but mark them as an extra type (besides horizontal
  and vertical separators)
- convert headings to toplines and baselines no earlier than in
  `return_boxes_of_images_by_order_of_reading_new`
- for both headings and horizontal separators, if they already span multiple
  columns, check whether they would overlap (horizontal) neighbours by
  looking at successively larger (left and right) intervals of columns
  (and pick the largest elongation which does not introduce any overlaps)
---
 src/eynollah/utils/__init__.py | 127 +++++++++++++++++++++------------
 1 file changed, 80 insertions(+), 47 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index a163fad..f3dbae2 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -1387,8 +1387,6 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None):
-    ncomps, ccomps = cv2.connectedComponents(region_pre_p.astype(np.uint8))
-
     separators_closeup = 1 * (region_pre_p == label_seps)
     separators_closeup[0:110] = 0
     separators_closeup[-150:] = 0
@@ -1414,14 +1412,6 @@
             dist_ye = max_ye - min_ye
             if dist_ye <= 50 and dist_xe >= 3 * dist_ye:
                 cnts_hor_e.append(cnt)
-            labels = np.setdiff1d(np.unique(ccomps[med_ye]), [0])
-            if len(labels) == 1:
-                # mid line does not intersect with any other region
-                # so add it as extra splitter line
-                cnts_hor_e.append(np.array([[[0, med_ye]],
-                                            [[ccomps.shape[1], med_ye]],
-                                            [[ccomps.shape[1], med_ye + 1]],
-                                            [[0, med_ye + 1]]]))
 
     # delete horizontal contours (leaving only the edges)
     separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0)
@@ -1493,7 +1483,7 @@
     slope_seps_org_hor=slope_seps_org_hor[dist_x_hor>=len_x/2.0]
     dist_x_hor=dist_x_hor[dist_x_hor>=len_x/2.0]
 
-    matrix_of_seps_ch=np.zeros((len(cy_seps_hor)+len(cx_seps_ver),10))
+    matrix_of_seps_ch = np.zeros((len(cy_seps_hor)+len(cx_seps_ver), 10), dtype=int)
matrix_of_seps_ch[:len(cy_seps_hor),0]=args_hor matrix_of_seps_ch[len(cy_seps_hor):,0]=args_ver matrix_of_seps_ch[len(cy_seps_hor):,1]=cx_seps_ver @@ -1515,34 +1505,17 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, if contours_h is not None: _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) - # matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) - # args_head = np.arange(len(cy_head)) - # matrix_l_n[:, 0] = args_head - # matrix_l_n[:, 2] = x_min_head+30 - # matrix_l_n[:, 3] = x_max_head-30 - # matrix_l_n[:, 4] = dist_x_head - # matrix_l_n[:, 5] = y_min_head-3-8 - # matrix_l_n[:, 6] = y_min_head-5-8 - # matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 - # matrix_l_n[:, 8] = 4 - # split at toplines (y_min_head) and baselines (y_max_head) instead of center (cy_head): - cy_head = np.stack((y_min_head, y_max_head)).T.flatten() - y_min_head, y_max_head = (np.stack((y_min_head - 2, y_max_head - 2)).T.flatten(), - np.stack((y_min_head + 2, y_max_head + 2)).T.flatten()) - x_min_head = np.repeat(x_min_head, 2) - x_max_head = np.repeat(x_max_head, 2) - dist_x_head = np.repeat(dist_x_head, 2) - matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1]), dtype=int) args_head = np.arange(len(cy_head)) matrix_l_n[:, 0] = args_head - # +/- 30px to avoid crossing col peaks by accident - matrix_l_n[:, 2] = x_min_head + 30 - matrix_l_n[:, 3] = x_max_head - 30 + matrix_l_n[:, 2] = x_min_head + matrix_l_n[:, 3] = x_max_head matrix_l_n[:, 4] = dist_x_head matrix_l_n[:, 5] = cy_head matrix_l_n[:, 6] = y_min_head matrix_l_n[:, 7] = y_max_head - matrix_l_n[:, 8] = 4 + matrix_l_n[:, 8] = y_max_head - y_min_head + matrix_l_n[:, 9] = 2 # mark as heading (so it can be split into 2 horizontal separators as needed) matrix_of_seps_ch = np.append( matrix_of_seps_ch, matrix_l_n, axis=0) @@ -1551,9 +1524,12 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, cy_seps_splitters = np.append(cy_seps_splitters, special_separators) if contours_h is not None: - cy_seps_splitters_head=cy_head[(x_min_head<=.16*region_pre_p.shape[1]) & - (x_max_head>=.84*region_pre_p.shape[1])] - cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) + y_min_splitters_head = y_min_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + y_max_splitters_head = y_max_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, y_min_splitters_head) + cy_seps_splitters = np.append(cy_seps_splitters, y_max_splitters_head) cy_seps_splitters = np.sort(cy_seps_splitters).astype(int) splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] @@ -1713,17 +1689,6 @@ def return_boxes_of_images_by_order_of_reading_new( #num_col, peaks_neg_fin = find_num_col( # regions_without_separators[top:bot,:], # multiplier=7.0) - x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] - x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] - cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - - if right2left_readingorder: - x_max_hor_some_new = width_tot - x_min_hor_some - x_min_hor_some_new = width_tot - x_max_hor_some - x_min_hor_some =list(np.copy(x_min_hor_some_new)) - x_max_hor_some =list(np.copy(x_max_hor_some_new)) - peaks_neg_tot = np.array([0] + 
peaks_neg_fin + [width_tot]) #print(peaks_neg_tot,'peaks_neg_tot') peaks_neg_tot_tables.append(peaks_neg_tot) @@ -1731,6 +1696,74 @@ def return_boxes_of_images_by_order_of_reading_new( all_columns = set(range(len(peaks_neg_tot) - 1)) #print("all_columns", all_columns) + # elongate horizontal separators+headings as much as possible without overlap + args_nonver = matrix_new[:, 9] != 1 + regions_with_separators = np.copy(regions_without_separators[top:bot]) + for xmin, xmax, ymin, ymax in matrix_new[:, [2, 3, 6, 7]]: + regions_with_separators[ymin - top: ymax - top, xmin: xmax] = 6 + # def dbg_imshow(box, title): + # xmin, xmax, ymin, ymax = box + # plt.imshow(regions_with_separators, extent=[0, width_tot, bot, top]) + # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax)) + # plt.show() + for i in np.flatnonzero(args_nonver): + xmin, xmax, ymin, ymax, typ = matrix_new[i, [2, 3, 6, 7, 9]] + cut = regions_with_separators[ymin - top: ymax - top] + # dbg_imshow([xmin, xmax, ymin, ymax], "separator %d (%s)" % (i, "heading" if typ else "horizontal")) + starting = xmin - peaks_neg_tot + min_start = np.flatnonzero(starting >= 0)[-1] # last left-of + ending = xmax - peaks_neg_tot + max_end = np.flatnonzero(ending < 0)[0] # first right-of + # skip elongation unless this is already a multi-column separator/heading: + if not max_end - min_start > 1: + continue + # is there anything left of min_start? + for j in range(min_start): + # dbg_imshow([peaks_neg_tot[j], xmin, ymin, ymax], "start of %d candidate %d" % (i, j)) + if not np.any(cut[:, peaks_neg_tot[j]: xmin]): + # print("elongated sep", i, "typ", typ, "start", xmin, "to", j, peaks_neg_tot[j]) + matrix_new[i, 2] = peaks_neg_tot[j] + 1 # elongate to start of this column + break + # is there anything right of max_end? 
+            for j in range(len(peaks_neg_tot) - 1, max_end, -1):
+                # dbg_imshow([xmax, peaks_neg_tot[j], ymin, ymax], "end of %d candidate %d" % (i, j))
+                if not np.any(cut[:, xmax: peaks_neg_tot[j]]):
+                    # print("elongated sep", i, "typ", typ, "end", xmax, "to", j, peaks_neg_tot[j])
+                    matrix_new[i, 3] = peaks_neg_tot[j] - 1 # elongate to end of this column
+                    break
+
+        args_hor = matrix_new[:, 9] == 0
+        x_min_hor_some = matrix_new[:, 2][args_hor]
+        x_max_hor_some = matrix_new[:, 3][args_hor]
+        y_max_hor_some = matrix_new[:, 7][args_hor]
+        cy_hor_some = matrix_new[:, 5][args_hor]
+
+        args_head = matrix_new[:, 9] == 2
+        x_min_hor_head = matrix_new[:, 2][args_head]
+        x_max_hor_head = matrix_new[:, 3][args_head]
+        y_min_hor_head = matrix_new[:, 6][args_head]
+        y_max_hor_head = matrix_new[:, 7][args_head]
+        cy_hor_head = matrix_new[:, 5][args_head]
+
+        # split headings at toplines (y_min_head) and baselines (y_max_head)
+        # instead of merely adding their center (cy_head) as horizontal separator
+        # (x +/- 30px to avoid crossing col peaks by accident)
+        x_min_hor_some = np.append(x_min_hor_some, np.tile(x_min_hor_head + 30, 2))
+        x_max_hor_some = np.append(x_max_hor_some, np.tile(x_max_hor_head - 30, 2))
+        y_max_hor_some = np.append(y_max_hor_some, # baselines
+                                   np.concatenate((y_min_hor_head + 2,
+                                                   y_max_hor_head + 2)))
+        cy_hor_some = np.append(cy_hor_some, # toplines
+                                np.concatenate((y_min_hor_head - 2,
+                                                y_max_hor_head - 2)))
+
+        if right2left_readingorder:
+            # swap as a tuple so the second expression still sees the
+            # original (unmirrored) values
+            x_min_hor_some, x_max_hor_some = (width_tot - x_max_hor_some,
+                                              width_tot - x_min_hor_some)
+
+
         reading_order_type, x_starting, x_ending, y_mid, y_max, \
             y_mid_without_mother, x_start_without_mother, x_end_without_mother, \
             there_is_sep_with_child, \

From 1a76ce177dba69aa711b74e6c69022e4a5ebf27f Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 14 Nov 2025 02:07:20 +0100
Subject: [PATCH 22/91] do_order_of_regions: round contour centers

(so we can be sure they do not fall through the "pixel cracks":
bboxes are delimited by integers, and we do not want to assign
contours between boxes)
---
 src/eynollah/eynollah.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py
index eee3777..35b0a37 100644
--- a/src/eynollah/eynollah.py
+++ b/src/eynollah/eynollah.py
@@ -2491,11 +2491,15 @@ class Eynollah:
             contours_only_text_parent)
         cx_head, cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours(
             contours_only_text_parent_h)
+        cx_main = np.array(cx_main, dtype=int)
+        cy_main = np.array(cy_main, dtype=int)
+        cx_head = np.array(cx_head, dtype=int)
+        cy_head = np.array(cy_head, dtype=int)
 
         def match_boxes(only_centers: bool):
             arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int)
             for ii in range(len(contours_only_text_parent)):
-                check_if_textregion_located_in_a_box = False
+                box_found = False
                 for jj, box in enumerate(boxes):
                     if ((cx_main[ii] >= box[0] and
                          cx_main[ii] < box[1] and
@@ -2506,22 +2510,23 @@ class Eynollah:
                          my_main[ii] >= box[2] and
                          My_main[ii] < box[3])):
                         arg_text_con_main[ii] = jj
-                        check_if_textregion_located_in_a_box = True
-                        #print("main/matched", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", box, only_centers)
+                        box_found = True
+                        # print("main/matched  ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", jj, box, only_centers)
                         break
-                if not check_if_textregion_located_in_a_box:
+                if not box_found:
                     dists_tr_from_box = np.linalg.norm(c_boxes -
np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + assert pcontained_in_box.any(), (ii, cx_main[ii], cy_main[ii]) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_main[ii] = ind_min - #print("main/fallback", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", boxes[ind_min], only_centers) + # print("main/fallback ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", ind_min, boxes[ind_min], only_centers) args_contours_main = np.arange(len(contours_only_text_parent)) order_by_con_main = np.zeros_like(arg_text_con_main) arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) for ii in range(len(contours_only_text_parent_h)): - check_if_textregion_located_in_a_box = False + box_found = False for jj, box in enumerate(boxes): if ((cx_head[ii] >= box[0] and cx_head[ii] < box[1] and @@ -2532,16 +2537,17 @@ class Eynollah: my_head[ii] >= box[2] and My_head[ii] < box[3])): arg_text_con_head[ii] = jj - check_if_textregion_located_in_a_box = True - #print("head/matched", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", box, only_centers) + box_found = True + # print("head/matched ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", jj, box, only_centers) break - if not check_if_textregion_located_in_a_box: + if not box_found: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) + assert pcontained_in_box.any(), (ii, cx_head[ii], cy_head[ii]) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_head[ii] = ind_min - #print("head/fallback", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", boxes[ind_min], only_centers) + # print("head/fallback ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", ind_min, boxes[ind_min], only_centers) args_contours_head = np.arange(len(contours_only_text_parent_h)) order_by_con_head = np.zeros_like(arg_text_con_head) From 95f76081d1de4611d3007ef14a342d7dbb530584 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Nov 2025 02:22:39 +0100 Subject: [PATCH 23/91] rename some more identifiers: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `lines` → `seps` (to distinguish from textlines) - `text_regions_p_1_n` → `text_regions_p_d` (because all other deskewed variables are called like this) - `pixel` → `label` --- src/eynollah/eynollah.py | 178 +++++++++++++++++++-------------------- 1 file changed, 89 insertions(+), 89 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 35b0a37..2bdb2c7 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2091,19 +2091,19 @@ class Eynollah: prediction_regions_org = prediction_regions_org[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only = (prediction_regions_org[:,:] ==3)*1 + mask_seps_only = (prediction_regions_org[:,:] == 3)*1 mask_texts_only = (prediction_regions_org[:,:] ==1)*1 mask_images_only=(prediction_regions_org[:,:] ==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, 
hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_seps, color=(3,3,3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_texts, color=(1,1,1)) @@ -2282,7 +2282,7 @@ class Eynollah: img_bin = resize_image(img_bin, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only = (prediction_regions_org[:,:] ==3)*1 + mask_seps_only = (prediction_regions_org[:,:] == 3)*1 mask_texts_only = (prediction_regions_org[:,:] ==1)*1 mask_texts_only = mask_texts_only.astype('uint8') @@ -2293,7 +2293,7 @@ class Eynollah: mask_texts_only = cv2.dilate(mask_texts_only, kernel=np.ones((2,2), np.uint8), iterations=1) mask_images_only=(prediction_regions_org[:,:] ==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) test_khat = np.zeros(prediction_regions_org.shape) test_khat = cv2.fillPoly(test_khat, pts=polygons_seplines, color=(1,1,1)) @@ -2307,7 +2307,7 @@ class Eynollah: #plt.show() polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) test_khat = np.zeros(prediction_regions_org.shape) test_khat = cv2.fillPoly(test_khat, pts = polygons_seplines, color=(1,1,1)) @@ -2318,10 +2318,10 @@ class Eynollah: polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) ##polygons_of_only_texts = dilate_textregion_contours(polygons_of_only_texts) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_seps, color=(3,3,3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) @@ -2377,7 +2377,7 @@ class Eynollah: prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h ) mask_zeros2 = (prediction_regions_org2[:,:,0] == 0) - mask_lines2 = (prediction_regions_org2[:,:,0] == 3) + mask_seps2 = (prediction_regions_org2[:,:,0] == 3) text_sume_early = (prediction_regions_org[:,:] == 1).sum() prediction_regions_org_copy = np.copy(prediction_regions_org) prediction_regions_org_copy[(prediction_regions_org_copy[:,:]==1) & (mask_zeros2[:,:]==1)] = 0 @@ -2388,8 +2388,8 @@ 
class Eynollah: if not(is_image_enhanced and rate_two_models < RATIO_OF_TWO_MODEL_THRESHOLD): prediction_regions_org = np.copy(prediction_regions_org_copy) - prediction_regions_org[(mask_lines2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 - mask_lines_only=(prediction_regions_org[:,:]==3)*1 + prediction_regions_org[(mask_seps2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 + mask_seps_only=(prediction_regions_org[:,:]==3)*1 prediction_regions_org = cv2.erode(prediction_regions_org[:,:], KERNEL, iterations=2) prediction_regions_org = cv2.dilate(prediction_regions_org[:,:], KERNEL, iterations=2) @@ -2411,20 +2411,20 @@ class Eynollah: prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only=(prediction_regions_org[:,:]==3)*1 + mask_seps_only=(prediction_regions_org[:,:]==3)*1 mask_texts_only=(prediction_regions_org[:,:]==1)*1 mask_images_only=(prediction_regions_org[:,:]==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only, 1, 0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only, 1, 0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only, 1, 0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true,pts = polygons_of_only_lines, color=(3, 3, 3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true,pts = polygons_of_only_seps, color=(3, 3, 3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1)) @@ -2449,7 +2449,7 @@ class Eynollah: prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - #mask_lines_only=(prediction_regions_org[:,:]==3)*1 + #mask_seps_only=(prediction_regions_org[:,:]==3)*1 #img = resize_image(img_org, int(img_org.shape[0]*1), int(img_org.shape[1]*1)) #prediction_regions_org = self.do_prediction(True, img, self.models["region"]) @@ -2457,19 +2457,19 @@ class Eynollah: #prediction_regions_org = prediction_regions_org[:,:,0] #prediction_regions_org[(prediction_regions_org[:,:] == 1) & (mask_zeros_y[:,:] == 1)]=0 - mask_lines_only = (prediction_regions_org == 3)*1 + mask_seps_only = (prediction_regions_org == 3)*1 mask_texts_only = (prediction_regions_org == 1)*1 mask_images_only= (prediction_regions_org == 2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = 
return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_seps, color=(3,3,3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) @@ -2952,8 +2952,8 @@ class Eynollah: mask_images = (text_regions_p_1[:, :] == 2) * 1 mask_images = mask_images.astype(np.uint8) mask_images = cv2.erode(mask_images[:, :], KERNEL, iterations=10) - mask_lines = (text_regions_p_1[:, :] == 3) * 1 - mask_lines = mask_lines.astype(np.uint8) + mask_seps = (text_regions_p_1[:, :] == 3) * 1 + mask_seps = mask_seps.astype(np.uint8) img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) @@ -2979,7 +2979,7 @@ class Eynollah: self.logger.exception(why) num_col = None #print("inside graphics 3 ", time.time() - t_in_gr) - return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, + return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea, img_bin_light) def run_graphics_and_columns_without_layout(self, textline_mask_tot_ea, img_bin_light): @@ -3029,8 +3029,8 @@ class Eynollah: mask_images = (text_regions_p_1[:, :] == 2) * 1 mask_images = mask_images.astype(np.uint8) mask_images = cv2.erode(mask_images[:, :], KERNEL, iterations=10) - mask_lines = (text_regions_p_1[:, :] == 3) * 1 - mask_lines = mask_lines.astype(np.uint8) + mask_seps = (text_regions_p_1[:, :] == 3) * 1 + mask_seps = mask_seps.astype(np.uint8) img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) @@ -3046,7 +3046,7 @@ class Eynollah: except Exception as why: self.logger.exception(why) num_col = None - return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, + return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, text_regions_p_1, cont_page, table_prediction) def run_enhancement(self, light_version): @@ -3101,13 +3101,13 @@ class Eynollah: return slope_deskew def run_marginals( - self, textline_mask_tot_ea, mask_images, mask_lines, + self, textline_mask_tot_ea, mask_images, mask_seps, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction): textline_mask_tot = textline_mask_tot_ea[:, :] textline_mask_tot[mask_images[:, :] == 1] = 0 - text_regions_p_1[mask_lines[:, :] == 1] = 3 + text_regions_p_1[mask_seps[:, :] == 1] = 3 text_regions_p = text_regions_p_1[:, :] text_regions_p = np.array(text_regions_p) if num_col_classifier in (1, 2): @@ -3131,12 +3131,12 @@ class Eynollah: self.logger.debug('enter run_boxes_no_full_layout') t_0_box = time.time() if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = rotation_not_90_func( + _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = rotation_not_90_func( image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], 
text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) table_prediction_n = resize_image(table_prediction_n, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_separators_d = (text_regions_p_1_n[:, :] == 1) * 1 + regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 if self.tables: regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 regions_without_separators = (text_regions_p[:, :] == 1) * 1 @@ -3146,7 +3146,7 @@ class Eynollah: if self.tables: regions_without_separators[table_prediction ==1 ] = 1 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None label_seps = 3 @@ -3156,7 +3156,7 @@ class Eynollah: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps) + text_regions_p_d, num_col_classifier, self.tables, label_seps) #print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -3171,7 +3171,7 @@ class Eynollah: t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_seps_ch, + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes_d = None self.logger.debug("len(boxes): %s", len(boxes)) @@ -3193,7 +3193,7 @@ class Eynollah: #print(time.time()-t_0_box,'time box in 3.3') else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_seps_ch_d, + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) @@ -3202,7 +3202,7 @@ class Eynollah: if self.light_version: pass else: - text_regions_p_tables = np.copy(text_regions_p_1_n) + text_regions_p_tables = np.copy(text_regions_p_d) text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 @@ -3245,22 +3245,22 @@ class Eynollah: else: polygons_of_images = return_contours_of_interested_region(img_revised_tab, 2) - pixel_img = 4 + label_marginalia = 4 min_area_mar = 0.00001 if self.light_version: - marginal_mask = (text_regions_p[:,:]==pixel_img)*1 + marginal_mask = (text_regions_p[:,:]==label_marginalia)*1 marginal_mask = marginal_mask.astype('uint8') marginal_mask = cv2.dilate(marginal_mask, KERNEL, iterations=2) polygons_of_marginals = return_contours_of_interested_region(marginal_mask, 1, min_area_mar) else: - polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + polygons_of_marginals = return_contours_of_interested_region(text_regions_p, label_marginalia, min_area_mar) - pixel_img = 10 - contours_tables = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + label_tables = 10 + contours_tables = return_contours_of_interested_region(text_regions_p, label_tables, min_area_mar) #print(time.time()-t_0_box,'time box in 
5') self.logger.debug('exit run_boxes_no_full_layout') - return (polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, + return (polygons_of_images, img_revised_tab, text_regions_p_d, textline_mask_tot_d, regions_without_separators_d, boxes, boxes_d, polygons_of_marginals, contours_tables) @@ -3276,13 +3276,13 @@ class Eynollah: text_regions_p[:,:][table_prediction[:,:]==1] = 10 img_revised_tab = text_regions_p[:,:] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ + _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, + text_regions_p.shape[0], + text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) @@ -3290,10 +3290,10 @@ class Eynollah: text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_separators_d = (text_regions_p_1_n[:,:] == 1)*1 + regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None # regions_without_separators = ( text_regions_p[:,:]==1 | text_regions_p[:,:]==2 )*1 @@ -3303,13 +3303,13 @@ class Eynollah: else: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ + _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, + text_regions_p.shape[0], + text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) @@ -3317,10 +3317,10 @@ class Eynollah: text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_separators_d = (text_regions_p_1_n[:,:] == 1)*1 + regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None @@ -3331,12 +3331,12 @@ class Eynollah: label_seps=3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - num_col_d, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps) + num_col_d, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) if num_col_classifier>=3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3351,7 +3351,7 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, 
regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[:,:][(table_prediction[:,:]==1)] = 10 @@ -3364,9 +3364,9 @@ class Eynollah: img_revised_tab2, table_prediction, 10, num_col_classifier) else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) - text_regions_p_tables = np.copy(text_regions_p_1_n) + text_regions_p_tables = np.copy(text_regions_p_d) text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 @@ -3399,20 +3399,20 @@ class Eynollah: text_regions_p[img_revised_tab == 10] = 10 #img_revised_tab[img_revised_tab2 == 10] = 10 - pixel_img = 4 + label_marginalia = 4 min_area_mar = 0.00001 if self.light_version: - marginal_mask = (text_regions_p[:,:]==pixel_img)*1 + marginal_mask = (text_regions_p[:,:]==label_marginalia)*1 marginal_mask = marginal_mask.astype('uint8') marginal_mask = cv2.dilate(marginal_mask, KERNEL, iterations=2) polygons_of_marginals = return_contours_of_interested_region(marginal_mask, 1, min_area_mar) else: - polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + polygons_of_marginals = return_contours_of_interested_region(text_regions_p, label_marginalia, min_area_mar) - pixel_img = 10 - contours_tables = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + label_tables = 10 + contours_tables = return_contours_of_interested_region(text_regions_p, label_tables, min_area_mar) # set first model with second model text_regions_p[:, :][text_regions_p[:, :] == 2] = 5 @@ -3465,16 +3465,16 @@ class Eynollah: #plt.show() ####if not self.tables: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout( + _, textline_mask_tot_d, text_regions_p_d, regions_fully_n = rotation_not_90_func_full_layout( image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) if not self.tables: - regions_without_separators_d = (text_regions_p_1_n[:, :] == 1) * 1 + regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None if not self.tables: @@ -3484,7 +3484,7 @@ class Eynollah: self.logger.debug('exit run_boxes_full_layout') #print("full inside 3", time.time()- t_full0) - return (polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, + return (polygons_of_images, img_revised_tab, text_regions_p_d, textline_mask_tot_d, regions_without_separators_d, regions_fully, regions_without_separators, polygons_of_marginals, 
contours_tables) @@ -4301,7 +4301,7 @@ class Eynollah: slope_deskew = self.run_deskew(textline_mask_tot_ea) #print("text region early -2,5 in %.1fs", time.time() - t0) #self.logger.info("Textregion detection took %.1fs ", time.time() - t1t) - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, \ text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea, img_bin_light = \ self.run_graphics_and_columns_light(text_regions_p_1, textline_mask_tot_ea, num_col_classifier, num_column_is_classified, @@ -4318,7 +4318,7 @@ class Eynollah: confidence_matrix = np.zeros((text_regions_p_1.shape[:2])) t1 = time.time() - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, \ text_regions_p_1, cont_page, table_prediction = \ self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified, erosion_hurts) @@ -4356,12 +4356,12 @@ class Eynollah: image_page = resize_image(image_page,img_h_new, img_w_new ) textline_mask_tot_ea = resize_image(textline_mask_tot_ea,img_h_new, img_w_new ) mask_images = resize_image(mask_images,img_h_new, img_w_new ) - mask_lines = resize_image(mask_lines,img_h_new, img_w_new ) + mask_seps = resize_image(mask_seps, img_h_new, img_w_new) text_regions_p_1 = resize_image(text_regions_p_1,img_h_new, img_w_new ) table_prediction = resize_image(table_prediction,img_h_new, img_w_new ) textline_mask_tot, text_regions_p = \ - self.run_marginals(textline_mask_tot_ea, mask_images, mask_lines, + self.run_marginals(textline_mask_tot_ea, mask_images, mask_seps, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) if self.plotter: self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) @@ -4398,14 +4398,14 @@ class Eynollah: ## birdan sora chock chakir t1 = time.time() if not self.full_layout: - polygons_of_images, img_revised_tab, text_regions_p_1_n, \ + polygons_of_images, img_revised_tab, text_regions_p_d, \ textline_mask_tot_d, regions_without_separators_d, \ boxes, boxes_d, polygons_of_marginals, contours_tables = \ self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, table_prediction, erosion_hurts) ###polygons_of_marginals = dilate_textregion_contours(polygons_of_marginals) else: - polygons_of_images, img_revised_tab, text_regions_p_1_n, \ + polygons_of_images, img_revised_tab, text_regions_p_d, \ textline_mask_tot_d, regions_without_separators_d, \ regions_fully, regions_without_separators, polygons_of_marginals, contours_tables = \ self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, @@ -4419,7 +4419,7 @@ class Eynollah: text_only = (img_revised_tab[:, :] == 1) * 1 if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - text_only_d = (text_regions_p_1_n[:, :] == 1) * 1 + text_only_d = ((text_regions_p_d[:, :] == 1)) * 1 #print("text region early 2 in %.1fs", time.time() - t0) ###min_con_area = 0.000005 @@ -4695,18 +4695,18 @@ class Eynollah: label_seps = 6 if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps, 
contours_only_text_parent_h) else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps) else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -4718,12 +4718,12 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, _ = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: boxes_d, _ = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: From 4abc2ff57249e634c70cda665abc5d99429595d2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Nov 2025 03:05:02 +0100 Subject: [PATCH 24/91] rewrite/simplify manual reading order using recursive algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - rename `return_x_start_end_mothers_childs_and_type_of_reading_order` → `return_multicol_separators_x_start_end`, and drop all the analysis pertaining to mother/child relationships and full-span separators, also drop the separator unification rules; instead of the latter, try to combine neighbouring separators more generally: join column spans iff there is nothing in between (which also necessitates passing the region mask), and keep only one of every such redundant pair; add the top (of each page part) as full-span separator up front, and return separators already ordered by y - `return_boxes_of_images_by_order_of_reading_new`: - also pass regions with separators, so they do not have to be reconstructed from the separator coordinates, and also contain images and other non-text region types, when trying to elongate separators to maximize their span (without introducing overlaps) - determine connected components of the region mask, i.e. labels and their respective bboxes, in order to 1. gain additional multi-column separators, if possible 2. 
avoid cutting through regions which do cross column boundaries later on - whenever adding a new bbox, first look up the label map to see if there are any multi-column regions extending to the right of the current column; if there are, then advance not just one column to the right, but as many as necessary to avoid cutting through these regions - new core algorithm: iterate separators sorted by y and then column by column, but whenever the next separator ends in the same column as the current one or even further left, recurse (i.e. finish that span first before continuing with the top iteration) --- src/eynollah/utils/__init__.py | 935 ++++++++++----------------------- 1 file changed, 277 insertions(+), 658 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f3dbae2..e00004f 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -32,289 +32,132 @@ def pairwise(iterable): yield a, b a = b -def return_x_start_end_mothers_childs_and_type_of_reading_order( - peak_points, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some): +def return_multicol_separators_x_start_end( + regions_without_separators, peak_points, top, bot, + x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some): """ Analyse which separators overlap multiple column candidates, and how they overlap each other. Ignore separators not spanning multiple columns. - For the separators to be returned, try to join them when they are directly - adjacent horizontally but nearby vertically (and thus mutually compatible). - Also, mark any separators that already span the full width. - - Furthermore, identify which pairs of (unjoined) separators span subsets of columns - of each other (disregarding vertical positions). Referring, respectively, to the - superset separators as "mothers" and to the subset separators as "children", - retrieve information on which columns are spanned by separators with no mother, - and which columns are spanned by their children (if any). - - Moreover, determine if there is any (column) overlap among the multi-span separators - with no mother, specifically (and thus, no simple box separation is possible). + For the separators to be returned, try to remove or unify them when there + is no region between them (vertically) and their neighbours. 
Arguments: + * the text mask (with all separators suppressed) * the x column coordinates - * the x start column index of the raw separators - * the x end column index of the raw separators - * the y center coordinate of the raw separators - * the y end coordinate of the raw separators + * the y start coordinate to consider in total + * the y end coordinate to consider in total + * the x start coordinate of the horizontal separators + * the x end coordinate of the horizontal separators + * the y start coordinate of the horizontal separators + * the y center coordinate of the horizontal separators + * the y end coordinate of the horizontal separators Returns: a tuple of: - * whether any top-level (no-mother) multi-span separators overlap each other * the x start column index of the resulting multi-span separators * the x end column index of the resulting multi-span separators + * the y start coordinate of the resulting multi-span separators * the y center coordinate of the resulting multi-span separators * the y end coordinate of the resulting multi-span separators - * the y center (for 1 representative) of the top-level (no-mother) multi-span separators - * the x start column index of the top-level (no-mother) multi-span separators - * the x end column index of the top-level (no-mother) multi-span separators - * whether any multi-span separators have super-spans of other (child) multi-span separators - * the y center (for 1 representative) of the top-level (no-mother) multi-span separators - which have super-spans of other (child) multi-span separators - * the x start column index of the top-level multi-span separators - which have super-spans of other (child) multi-span separators - * the x end column index of the top-level multi-span separators - which have super-spans of other (child) multi-span separators - * indexes of multi-span separators with full-width span """ - x_start=[] - x_end=[] - len_sep=[] - y_mid=[] - y_max=[] - new_main_sep_y=[] - indexer=0 + x_start = [0] + x_end = [len(peak_points) - 1] + y_min = [top] + y_mid = [top] + y_max = [top + 2] + indexer = 1 for i in range(len(x_min_hor_some)): #print(indexer, "%d:%d" % (x_min_hor_some[i], x_max_hor_some[i]), cy_hor_some[i]) starting = x_min_hor_some[i] - peak_points min_start = np.flatnonzero(starting >= 0)[-1] # last left-of ending = x_max_hor_some[i] - peak_points - max_end = np.flatnonzero(ending < 0)[0] # first right-of + max_end = np.flatnonzero(ending <= 0)[0] # first right-of #print(indexer, "%d:%d" % (min_start, max_end)) if (max_end-min_start)>=2: # column range of separator spans more than one column candidate - if (max_end-min_start)==(len(peak_points)-1): - # all columns (i.e. 
could be true new y splitter) - new_main_sep_y.append(indexer) - #print((max_end-min_start),len(peak_points),'(max_end-min_start)') + y_min.append(y_min_hor_some[i]) y_mid.append(cy_hor_some[i]) y_max.append(y_max_hor_some[i]) x_end.append(max_end) x_start.append(min_start) - len_sep.append(max_end-min_start) indexer+=1 #print(x_start,'x_start') #print(x_end,'x_end') - x_start_returned = np.array(x_start, dtype=int) - x_end_returned = np.array(x_end, dtype=int) - y_mid_returned = np.array(y_mid, dtype=int) - y_max_returned = np.array(y_max, dtype=int) - #print(y_mid_returned,'y_mid_returned') - #print(x_start_returned,'x_start_returned') - #print(x_end_returned,'x_end_returned') - - # join/elongate separators if follow-up x and similar y - sep_pairs = contours_in_same_horizon(y_mid_returned) - if len(sep_pairs): - #print('burda') - args_to_be_unified = set() - y_mid_unified = [] - y_max_unified = [] - x_start_unified = [] - x_end_unified = [] - for pair in sep_pairs: - if (not np.array_equal(*x_start_returned[pair]) and - not np.array_equal(*x_end_returned[pair]) and - # immediately adjacent columns? - np.diff(x_end_returned[pair] - - x_start_returned[pair])[0] in [1, -1]): - - args_to_be_unified.union(set(pair)) - y_mid_unified.append(np.min(y_mid_returned[pair])) - y_max_unified.append(np.max(y_max_returned[pair])) - x_start_unified.append(np.min(x_start_returned[pair])) - x_end_unified.append(np.max(x_end_returned[pair])) - #print(pair,'pair') - #print(x_start_returned[pair],'x_s_same_hor') - #print(x_end_returned[pair],'x_e_same_hor') - #print(y_mid_unified,'y_mid_unified') - #print(y_max_unified,'y_max_unified') - #print(x_start_unified,'x_s_unified') - #print(x_end_unified,'x_e_selected') - #print('#############################') - - if len(y_mid_unified): - args_lines_not_unified = np.setdiff1d(np.arange(len(y_mid_returned)), - list(args_to_be_unified), assume_unique=True) - #print(args_lines_not_unified,'args_lines_not_unified') - x_start_returned = np.append(x_start_returned[args_lines_not_unified], - x_start_unified, axis=0) - x_end_returned = np.append(x_end_returned[args_lines_not_unified], - x_end_unified, axis=0) - y_mid_returned = np.append(y_mid_returned[args_lines_not_unified], - y_mid_unified, axis=0) - y_max_returned = np.append(y_max_returned[args_lines_not_unified], - y_max_unified, axis=0) - #print(y_mid_returned,'y_mid_returned2') - #print(x_start_returned,'x_start_returned2') - #print(x_end_returned,'x_end_returned2') - - #print(new_main_sep_y,'new_main_sep_y') - #print(x_start,'x_start') - #print(x_end,'x_end') - x_start = np.array(x_start) - x_end = np.array(x_end) - y_mid = np.array(y_mid) - if len(new_main_sep_y): - # some full-width multi-span separators exist, so - # restrict the y range of separators to search for - # mutual overlaps to only those within the largest - # y strip between adjacent multi-span separators - # that involve at least one such full-width seps. 
- # (does not affect the separators to be returned) - min_ys=np.min(y_mid) - max_ys=np.max(y_mid) - #print(min_ys,'min_ys') - #print(max_ys,'max_ys') - - y_mains0 = list(y_mid[new_main_sep_y]) - y_mains = [min_ys] + y_mains0 + [max_ys] - - y_mains = np.sort(y_mains) - argm = np.argmax(np.diff(y_mains)) - y_mid_new = y_mains[argm] - y_mid_next_new = y_mains[argm + 1] - - #print(y_mid_new,argm,'y_mid_new') - #print(y_mid_next_new,argm+1,'y_mid_next_new') - #print(y_mid[new_main_sep_y],new_main_sep_y,'yseps') - x_start=np.array(x_start) - x_end=np.array(x_end) - y_mid=np.array(y_mid) - # iff either boundary is itself not a full-width separator, - # then include it in the range of separators to be kept - if y_mid_new in y_mains0: - where = y_mid > y_mid_new - else: - where = y_mid >= y_mid_new - if y_mid_next_new in y_mains0: - where &= y_mid < y_mid_next_new - else: - where &= y_mid <= y_mid_next_new - x_start = x_start[where] - x_end = x_end[where] - y_mid = y_mid[where] + x_start = np.array(x_start, dtype=int) + x_end = np.array(x_end, dtype=int) + y_min = np.array(y_min, dtype=int) + y_mid = np.array(y_mid, dtype=int) + y_max = np.array(y_max, dtype=int) + #print(y_mid,'y_mid') #print(x_start,'x_start') #print(x_end,'x_end') - # remove redundant separators that span the same columns - # (keeping only 1 representative each) - deleted = set() - for index_i in range(len(x_start) - 1): - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) - #print(nodes_i, "nodes_i") - for index_j in range(index_i + 1, len(x_start)): - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - #print(nodes_j, "nodes_j") - if nodes_i == nodes_j: - deleted.add(index_j) - #print(deleted,"deleted") - remained_sep_indexes = set(range(len(x_start))) - deleted - #print(remained_sep_indexes,'remained_sep_indexes') + # remove redundant separators (with nothing in between) + args_emptysep = set() + args_ysorted = np.argsort(y_mid) + for i in range(len(y_mid)): + # find nearest neighbours above with nothing in between + prev = (~np.eye(len(y_mid), dtype=bool)[i] & + (y_mid[i] >= y_mid) & + # complete subsumption: + # (x_start[i] >= x_start) & + # (x_end[i] <= x_end) + # partial overlap + (x_start[i] < x_end) & + (x_end[i] > x_start) + ) + prev[list(args_emptysep)] = False # but no pair we already saw + if not prev.any(): + continue + prev = np.flatnonzero(prev[args_ysorted]) + j = args_ysorted[prev[-1]] + if not np.any(regions_without_separators[y_max[j]: y_min[i], + peak_points[min(x_start[i], x_start[j])]: + peak_points[max(x_end[i], x_end[j])]]): + args_emptysep.add(i) + if x_start[j] > x_start[i]: + # print(j, "now starts at", x_start[i]) + x_start[j] = x_start[i] + if x_end[j] < x_end[i]: + x_end[j] = x_end[i] + # print(j, "now ends at", x_end[i]) + # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty prev sep") + continue + # find nearest neighbours below with nothing in between + nExt = (~np.eye(len(y_mid), dtype=bool)[i] & + (y_mid[i] <= y_mid) & + (x_start[i] >= x_start) & + (x_end[i] <= x_end)) + nExt[list(args_emptysep)] = False # but no pair we already saw + if not nExt.any(): + continue + nExt = np.flatnonzero(nExt[args_ysorted]) + j = args_ysorted[nExt[0]] + if not np.any(regions_without_separators[y_max[i]: y_min[j], + peak_points[x_start[i]]: + peak_points[x_end[i]]]): + args_emptysep.add(i) + # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty next sep") + args_to_be_kept = [arg for arg in args_ysorted + if not arg in 
args_emptysep] + x_start = x_start[args_to_be_kept] + x_end = x_end[args_to_be_kept] + y_min = y_min[args_to_be_kept] + y_mid = y_mid[args_to_be_kept] + y_max = y_max[args_to_be_kept] - # determine which separators span which columns - mother = [] # whether the respective separator has a mother separator - child = [] # whether the respective separator has a child separator - for index_i in remained_sep_indexes: - have_mother=0 - have_child=0 - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) - for index_j in remained_sep_indexes: - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - if nodes_i < nodes_j: - have_mother=1 - if nodes_i > nodes_j: - have_child=1 - mother.append(have_mother) - child.append(have_child) - #print(mother, "mother") - #print(child, "child") - - mother = np.array(mother) - child = np.array(child) - #print(mother,'mother') - #print(child,'child') - remained_sep_indexes = np.array(list(remained_sep_indexes)) - #print(len(remained_sep_indexes)) - #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_mid),'lens') - - reading_order_type = 0 - if len(remained_sep_indexes): - #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)') - #print(np.array(mother),'mother') - remained_sep_indexes_without_mother = remained_sep_indexes[mother==0] - remained_sep_indexes_with_child_without_mother = remained_sep_indexes[(mother==0) & (child==1)] - #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') - #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') - - x_end_with_child_without_mother = x_end[remained_sep_indexes_with_child_without_mother] - x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] - y_mid_with_child_without_mother = y_mid[remained_sep_indexes_with_child_without_mother] - - x_end_without_mother = x_end[remained_sep_indexes_without_mother] - x_start_without_mother = x_start[remained_sep_indexes_without_mother] - y_mid_without_mother = y_mid[remained_sep_indexes_without_mother] - - if len(remained_sep_indexes_without_mother)>=2: - for i in range(len(remained_sep_indexes_without_mother)-1): - index_i = remained_sep_indexes_without_mother[i] - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) - #print(index_i, nodes_i, "nodes_i without mother") - for j in range(i + 1, len(remained_sep_indexes_without_mother)): - index_j = remained_sep_indexes_without_mother[j] - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - #print(index_j, nodes_j, "nodes_j without mother") - if nodes_i - nodes_j != nodes_i: - #print("type=1") - reading_order_type = 1 - else: - y_mid_without_mother = np.zeros(0, int) - x_start_without_mother = np.zeros(0, int) - x_end_without_mother = np.zeros(0, int) - y_mid_with_child_without_mother = np.zeros(0, int) - x_start_with_child_without_mother = np.zeros(0, int) - x_end_with_child_without_mother = np.zeros(0, int) - - #print(reading_order_type,'reading_order_type') - #print(y_mid_with_child_without_mother,'y_mid_with_child_without_mother') - #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') - #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') - - len_sep_with_child = len(child[child==1]) - #print(len_sep_with_child,'len_sep_with_child') - there_is_sep_with_child = 0 - if len_sep_with_child >= 1: - there_is_sep_with_child = 1 - - return (reading_order_type, - x_start_returned, - x_end_returned, - y_mid_returned, - y_max_returned, - y_mid_without_mother, 
- x_start_without_mother, - x_end_without_mother, - there_is_sep_with_child, - y_mid_with_child_without_mother, - x_start_with_child_without_mother, - x_end_with_child_without_mother, - new_main_sep_y) + return (x_start, + x_end, + y_min, + y_mid, + y_max) def box2rect(box: Tuple[int, int, int, int]) -> Tuple[int, int, int, int]: return (box[1], box[1] + box[3], @@ -1212,6 +1055,25 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) return textlines_con_changed def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): + """ + Order text region contours within a single column bbox in a top-down-left-right way. + + First, determine the vertical gaps. Then iterate over each vertical segment, + identifying the contours centered in that segment. Order them by their + horizontal center, and add them to the overall order. + + Arguments: + * textline_mask: the mask of the textline segmentation, cropped for that box + * contours_main: the paragraph text region contours expected to be here + * contours_head: the heading text region contours expected to be here + * y_ref: the vertical offset of that box within the page + * x_ref: the horizontal offset of that box within the page + + Returns: a tuple of + * the array of contour indexes overall within this box (i.e. into main+head) + * the array of types (1 for paragraph, 2 for heading) + * the array of contour indexes for the respective type (i.e. into contours_main or contours_head) + """ ##plt.imshow(textline_mask) ##plt.show() y = textline_mask.sum(axis=1) # horizontal projection profile @@ -1547,7 +1409,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, try: num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot], num_col_classifier, tables, multiplier=7.0) - #print("big part %d:%d has %d columns" % (top, bot, num_col), peaks_neg_fin) + # print("big part %d:%d has %d columns" % (top, bot, num_col + 1), peaks_neg_fin) except: num_col = 0 peaks_neg_fin = [] @@ -1564,11 +1426,36 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, return num_col_fin, peaks_neg_fin_fin, matrix_of_seps_ch, splitter_y_new, separators_closeup_n def return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, - matrix_of_lines_ch, + splitter_y_new, + regions_without_separators, + regions_with_separators, + matrix_of_seps_ch, num_col_classifier, erosion_hurts, tables, right2left_readingorder, logger=None): + """ + Iterate through the vertical parts of a page, each with its own set of columns, + and from the matrix of horizontal separators for that part, find an ordered + list of bounding boxes through all columns and regions. 
+
+    Arguments:
+    * splitter_y_new: the y coordinates separating the parts
+    * regions_without_separators: (text) region mask with separators suppressed;
+      (needed to find per-part columns and to combine separators if possible)
+    * regions_with_separators: (full) region map with separators included;
+      (needed to elongate separators if possible)
+    * matrix_of_seps_ch: type and coordinates of horizontal and vertical separators,
+      as well as headings
+    * num_col_classifier: predicted number of columns for the entire page
+    * erosion_hurts: bool
+    * tables: bool
+    * right2left_readingorder: whether to invert the default left-to-right order
+
+    Returns: a tuple of
+    * the ordered list of bounding boxes
+    * a list of arrays: the x coordinates delimiting the columns for every page part
+      (according to splitter)
+    """
     if right2left_readingorder:
         regions_without_separators = cv2.flip(regions_without_separators,1)
@@ -1576,12 +1463,20 @@ return_boxes_of_images_by_order_of_reading_new(
         logger = getLogger(__package__)
     logger.debug('enter return_boxes_of_images_by_order_of_reading_new')
+    # def dbg_imshow(box, title):
+    #     xmin, xmax, ymin, ymax = box
+    #     plt.imshow(regions_with_separators) #, extent=[0, width_tot, bot, top])
+    #     plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
+    #                                           fill=False, linewidth=1, edgecolor='r'))
+    #     plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax))
+    #     plt.show()
     # def dbg_plt(box=None, title=None, rectangles=None, rectangles_showidx=False):
     #     minx, maxx, miny, maxy = box or (0, None, 0, None)
     #     img = regions_without_separators[miny:maxy, minx:maxx]
     #     plt.imshow(img)
-    #     xrange = np.arange(0, img.shape[1], 100)
-    #     yrange = np.arange(0, img.shape[0], 100)
+    #     step = max(img.shape) // 10
+    #     xrange = np.arange(0, img.shape[1], step)
+    #     yrange = np.arange(0, img.shape[0], step)
     #     ax = plt.gca()
     #     ax.set_xticks(xrange)
     #     ax.set_yticks(yrange)
@@ -1597,7 +1492,7 @@ return_boxes_of_images_by_order_of_reading_new(
     #             ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
     #                                            fill=False, linewidth=1, edgecolor='r'))
     #             if rectangles_showidx:
-    #                 ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i + 1), c='r')
+    #                 ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i), c='r')
     #     plt.show()

     # dbg_plt(title="return_boxes_of_images_by_order_of_reading_new")
@@ -1606,11 +1501,12 @@ return_boxes_of_images_by_order_of_reading_new(
     splitter_y_new = np.array(splitter_y_new, dtype=int)
     height_tot, width_tot = regions_without_separators.shape
     big_part = 22 * height_tot // 100 # percent height
+    _, ccomps, cstats, _ = cv2.connectedComponentsWithStats(regions_without_separators.astype(np.uint8))
     for top, bot in pairwise(splitter_y_new):
         # print("%d:%d" % (top, bot), 'i')
         # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot))
-        matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) &
-                                        (matrix_of_lines_ch[:,7] < bot)]
+        matrix_new = matrix_of_seps_ch[(matrix_of_seps_ch[:,6] >= top) &
+                                       (matrix_of_seps_ch[:,7] < bot)]
         #print(len( matrix_new[:,9][matrix_new[:,9]==1] ))
         #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa')
         # check to see is there any vertical separator to find holes.
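Before the remaining hunks, a self-contained toy sketch of the new core
algorithm from the commit message above (a simplified illustration, not the
patch code: it ignores the region mask and the multi-column `get_span`
advance, and assumes separators come as `(y, x_start, x_end)` triples over
column indices, sorted by y, with a synthetic full-width top separator in
front, as `return_multicol_separators_x_start_end` provides):

    def order_boxes(seps, bot):
        # seps: [(y, x_start, x_end)] ascending in y; x_* are column indices;
        # seps[0] is the synthetic full-width separator at the top of the part
        boxes = []
        pending = list(range(len(seps)))

        def add_sep(cur):
            y, start, end = seps[cur]
            col = start
            while col < end:
                # nearest separator below y that covers column col
                nxt = next((j for j in range(len(seps))
                            if seps[j][0] > y and seps[j][1] <= col < seps[j][2]),
                           None)
                if nxt is None:
                    boxes.append((col, col + 1, y, bot))  # fill to part bottom
                    col += 1
                else:
                    y_nxt, _, e_nxt = seps[nxt]
                    boxes.append((col, col + 1, y, y_nxt))
                    col += 1
                    if col == e_nxt and e_nxt <= end and nxt in pending:
                        pending.remove(nxt)
                        add_sep(nxt)  # finish the child's span before continuing

        while pending:
            add_sep(pending.pop(0))
        return boxes

    # two columns, one separator across both at y=50, part bottom at y=100:
    print(order_boxes([(0, 0, 2), (50, 0, 2)], 100))
    # [(0, 1, 0, 50), (1, 2, 0, 50), (0, 1, 50, 100), (1, 2, 50, 100)]

The printed boxes are (x_start_col, x_end_col, y_top, y_bottom) in reading
order: both columns above the separator first, then both below, which is the
same recursion the patch implements with `args`/`add_sep` further down.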
@@ -1698,19 +1594,9 @@ def return_boxes_of_images_by_order_of_reading_new(
             # elongate horizontal separators+headings as much as possible without overlap
             args_nonver = matrix_new[:, 9] != 1
-            regions_with_separators = np.copy(regions_without_separators[top:bot])
-            for xmin, xmax, ymin, ymax in matrix_new[:, [2, 3, 6, 7]]:
-                regions_with_separators[ymin - top: ymax - top, xmin: xmax] = 6
-            # def dbg_imshow(box, title):
-            #     xmin, xmax, ymin, ymax = box
-            #     plt.imshow(regions_with_separators, extent=[0, width_tot, bot, top])
-            #     plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
-            #                                           fill=False, linewidth=1, edgecolor='r'))
-            #     plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax))
-            #     plt.show()
             for i in np.flatnonzero(args_nonver):
                 xmin, xmax, ymin, ymax, typ = matrix_new[i, [2, 3, 6, 7, 9]]
-                cut = regions_with_separators[ymin - top: ymax - top]
+                cut = regions_with_separators[ymin: ymax]
                 # dbg_imshow([xmin, xmax, ymin, ymax], "separator %d (%s)" % (i, "heading" if typ else "horizontal"))
                 starting = xmin - peaks_neg_tot
                 min_start = np.flatnonzero(starting >= 0)[-1] # last left-of
@@ -1737,6 +1623,7 @@
             args_hor = matrix_new[:, 9] == 0
             x_min_hor_some = matrix_new[:, 2][args_hor]
             x_max_hor_some = matrix_new[:, 3][args_hor]
+            y_min_hor_some = matrix_new[:, 6][args_hor]
             y_max_hor_some = matrix_new[:, 7][args_hor]
             cy_hor_some = matrix_new[:, 5][args_hor]
@@ -1752,412 +1639,144 @@
                 # (x +/- 30px to avoid crossing col peaks by accident)
                 x_min_hor_some = np.append(x_min_hor_some, np.tile(x_min_hor_head + 30, 2))
                 x_max_hor_some = np.append(x_max_hor_some, np.tile(x_max_hor_head - 30, 2))
+                y_min_hor_some = np.append(y_min_hor_some, # toplines
+                                           np.concatenate((y_min_hor_head - 2,
+                                                           y_max_hor_head - 0)))
                 y_max_hor_some = np.append(y_max_hor_some, # baselines
-                                           np.concatenate((y_min_hor_head + 2,
+                                           np.concatenate((y_min_hor_head + 0,
                                                            y_max_hor_head + 2)))
-                cy_hor_some = np.append(cy_hor_some, # toplines
-                                        np.concatenate((y_min_hor_head - 2,
-                                                        y_max_hor_head - 2)))
+                cy_hor_some = np.append(cy_hor_some, # centerlines
+                                        np.concatenate((y_min_hor_head - 1,
+                                                        y_max_hor_head + 1)))
+
+            # analyse connected components of regions to gain additional separators
+            # and prepare a map for cross-column boxes
+            ccounts = np.bincount(ccomps[top: bot].flatten())
+            col_ccounts = np.stack([np.bincount(ccomps[top: bot, left: right].flatten(),
+                                                minlength=ccounts.size)
+                                    for left, right in pairwise(peaks_neg_tot)])
+            labelcolmap = dict()
+            for label, label_count in enumerate(ccounts):
+                if not label:
+                    continue
+                label_left, label_top, label_width, label_height, label_area = cstats[label]
+                # if label_count < 0.9 * label_area:
+                #     # mostly not in this part of the page
+                #     continue
+                if label_count < 0.01 * (bot - top) * width_tot:
+                    continue
+                #assert np.sum(col_ccounts[:, label]) == label_count
+                label_right = label_left + label_width
+                label_bot = label_top + label_height
+                label_start = np.flatnonzero(peaks_neg_tot > label_left)[0] - 1
+                label_end = np.flatnonzero(peaks_neg_tot >= label_right)[0]
+                # store as dict for multi-column boxes:
+                for start in range(label_start, label_end):
+                    labelcolmap.setdefault(start, list()).append(
+                        (label_end, label_top, label_bot, sum(col_ccounts[start: label_end, label])))
+                # make additional separators:
+                if label_end - label_start < 2:
+                    continue
+                if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2:
+                    continue
+                x_min_hor_some =
np.append(x_min_hor_some, [label_left] * 2) + x_max_hor_some = np.append(x_max_hor_some, [label_right] * 2) + y_min_hor_some = np.append(y_min_hor_some, [label_top - 2, label_bot]) + y_max_hor_some = np.append(y_max_hor_some, [label_top, label_bot + 2]) + cy_hor_some = np.append(cy_hor_some, [label_top - 1, label_bot + 1]) if right2left_readingorder: x_max_hor_some = width_tot - x_min_hor_some x_min_hor_some = width_tot - x_max_hor_some - - reading_order_type, x_starting, x_ending, y_mid, y_max, \ - y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ - there_is_sep_with_child, \ - y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - peaks_neg_tot, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some) - - # show multi-column separators - # dbg_plt([0, None, top, bot], "multi-column separators in current split", + x_starting, x_ending, y_min, y_mid, y_max = return_multicol_separators_x_start_end( + regions_without_separators, peaks_neg_tot, top, bot, + x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some) + # dbg_plt([0, None, top, bot], "non-empty multi-column separators in current split", # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], - # y_mid - top, y_max - top)), True) + # y_min - top, y_max - top)), True) - if (reading_order_type == 1 or - len(y_mid_without_mother) >= 2 or - there_is_sep_with_child == 1): - # there are top-level multi-colspan horizontal separators which overlap each other - # or multiple top-level multi-colspan horizontal separators - # or multi-colspan horizontal separators shorter than their respective top-level: - # todo: explain how this is dealt with - try: - y_grenze = top + 300 - up = (y_mid > top) & (y_mid <= y_grenze) - - args_early_ys=np.arange(len(y_mid)) - #print(args_early_ys,'args_early_ys') - #print(y_mid,'y_mid') - - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up = args_early_ys[up] - #print(args_up,'args_up') - #print(y_mid_up,'y_mid_up') - #check if there is a big separator in this y_mains0 - if len(y_mid_up) > 0: - # is there a separator with full-width span? 
- main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) - y_mid_main_separator_up = y_mid_up[main_separator] - y_max_main_separator_up = y_max_up[main_separator] - args_main_to_deleted = args_up[main_separator] - #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_max_main_separator_up): - args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) - #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[-1], - top, y_max_main_separator_up.max()]) - # dbg_plt(boxes[-1], "near top main separator box") - top = y_max_main_separator_up.max() - - #print(top,'top') - y_mid = y_mid[args_to_be_kept] - x_starting = x_starting[args_to_be_kept] - x_ending = x_ending[args_to_be_kept] - y_max = y_max[args_to_be_kept] - - #print('galdiha') - y_grenze = top + 200 - up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys2 = np.arange(len(y_mid)) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up2 = args_early_ys2[up] - #print(y_mid_up,x_starting_up,x_ending_up,'didid') - else: - args_early_ys2 = args_early_ys - args_up2 = args_up - - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') - - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) - - if len(args_to_be_kept2): - #print(args_to_be_kept2, "args_to_be_kept2") - y_mid = y_mid[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_max = y_max[args_to_be_kept2] - - #int(top) - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if (reading_order_type == 1 or - len(x_end_with_child_without_mother) == 0): - if reading_order_type == 1: - # there are top-level multi-colspan horizontal separators which overlap each other - #print("adding all columns at top because of multiple overlapping mothers") - y_mid_by_order.append(top) - x_start_by_order.append(0) - x_end_by_order.append(len(peaks_neg_tot)-2) - else: - # there are no top-level multi-colspan horizontal separators which themselves - # contain shorter multi-colspan separators - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - 
ind_args=np.arange(len(y_mid)) - #print(ind_args,'ind_args') - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - else: - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - columns_covered_by_mothers_with_child = set() - for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_mothers_with_child.update( - range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) - #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") - columns_not_covered_by_mothers_with_child = list( - all_columns - columns_covered_by_mothers_with_child) - #indexes_to_be_spanned=[] - for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) - #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") - ind_args = np.arange(len(y_mid)) - for i_s_nc in columns_not_covered_by_mothers_with_child: - if i_s_nc in x_start_with_child_without_mother: - # use only seps with mother's span ("biggest") - #print("i_s_nc", i_s_nc) - x_end_biggest_column = \ - x_end_with_child_without_mother[ - x_start_with_child_without_mother == i_s_nc][0] - args_all_biggest_seps = \ - ind_args[(x_starting == i_s_nc) & - (x_ending == x_end_biggest_column)] - y_mid_column_nc = y_mid[args_all_biggest_seps] - #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") - #x_start_column_nc = x_starting[args_all_biggest_seps] - #x_end_column_nc = x_ending[args_all_biggest_seps] - y_mid_column_nc = np.sort(y_mid_column_nc) - #print(y_mid_column_nc, "y_mid_column_nc (sorted)") - for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): - #print("i_c", i_c) - #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") - ind_all_seps_between_nm_wc = \ - ind_args[(y_mid > nc_top) & - (y_mid < nc_bot) & - (x_starting >= i_s_nc) & - (x_ending <= x_end_biggest_column)] - y_mid_all_between_nm_wc = 
y_mid[ind_all_seps_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] - - columns_covered_by_mothers = set() - for dj in range(len(ind_all_seps_between_nm_wc)): - columns_covered_by_mothers.update( - range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - child_columns = set(range(i_s_nc, x_end_biggest_column)) - columns_not_covered = list(child_columns - columns_covered_by_mothers) - #print(child_columns, "child_columns") - #print(columns_not_covered, "columns_not_covered") - - if len(ind_all_seps_between_nm_wc): - biggest = np.argmax(x_ending_all_between_nm_wc - - x_starting_all_between_nm_wc) - #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") - #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest]), "biggest") - if columns_covered_by_mothers == set( - range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])): - # single biggest accounts for all covered columns alone, - # this separator should be extended to cover all - seps_too_close_to_top_separator = \ - ((y_mid_all_between_nm_wc > nc_top) & - (y_mid_all_between_nm_wc <= nc_top + 500)) - if (np.count_nonzero(seps_too_close_to_top_separator) and - np.count_nonzero(seps_too_close_to_top_separator) < - len(ind_all_seps_between_nm_wc)): - #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") - y_mid_all_between_nm_wc = \ - y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] - x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] - x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] - - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_end_biggest_column) - else: - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - - if len(columns_not_covered): - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) - - ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(int(i_s_nc), int(x_end_biggest_column)): - ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] - x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] - x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - else: - #print(i_s_nc,'column not covered by mothers with 
child') - ind_args_in_col=ind_args[x_starting==i_s_nc] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(int(x_start_itself), int(x_end_itself)+1): - #print(column,'cols') - #print('burda') - #print('burda2') - y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') - #print(y_mid_itself,'y_mid_itself') + # core algorithm: + # 1. iterate through multi-column separators, pre-ordered by their y coord + # 2. for each separator, iterate from its starting to its ending column + # 3. in each starting column, determine the next downwards separator, + # 4. if there is none, then fill up the column to the bottom; + # otherwise, fill up to that next separator + # 5. moreover, determine the next rightward column that would not cut through + # any regions, advancing to that column, and storing a new in-order bbox + # for that down/right span + # 6. if there was a next separator, and it ends no further than the current one, + # then recurse on that separator from step 1, then continue (with the next + # column for the current separator) at step 2, or (with the next separator + # in order) at step 1 + args = list(range(len(y_mid))) + while len(args): + cur = args[0] + args = args[1:] + # print("iter", cur, y_mid[cur], "%d:%d" % (x_starting[cur], x_ending[cur])) + def get_span(start, y_top, y_bot): + # for last, l_top, l_bot, l_count in labelcolmap.get(start, []): + # if y_top < l_bot and y_bot > l_top and last > start + 1: + # width = (peaks_neg_tot[last] - peaks_neg_tot[start]) + # print("span", start, last, l_top, l_bot, l_count, + # "box area", (y_bot - y_top) * width, + # "label area", (min(y_bot, l_bot) - max(y_top, l_top)) * width, + # "box height", (y_bot - y_top), + # "label height", sum(regions_without_separators[ + # y_top: y_bot, peaks_neg_tot[start + 1]])) + return min((last for last, l_top, l_bot, l_count in labelcolmap.get(start, []) + # yield the right-most column that does not cut through + # any regions in this horizontal span + if y_top < l_bot and y_bot > l_top + # Ignore if it ends here, anyway + and last > start + 1 + # Ensure this is not just a tiny region near larger regions + and l_count > 0.1 * max(l_count2 for _, l_top2, l_bot2, l_count2 in labelcolmap[start] + if y_top < l_bot2 and y_bot > l_top2) + # or just a small cut of the respective region + # (i.e. box should cover at least 10% of the label). + and ((min(y_bot, l_bot) - max(y_top, l_top)) * + (peaks_neg_tot[last] - peaks_neg_tot[start])) > 0.1 * l_count + # But do allow cutting tiny passages with less 10% of height + # (i.e. 
label is already almost separated by columns) + and sum(regions_without_separators[ + y_top: y_bot, peaks_neg_tot[start + 1]]) > 0.1 * (y_bot - y_top)), + # Otherwise advance only 1 column. + default=start + 1) + def add_sep(cur): + column = x_starting[cur] + while column < x_ending[cur]: + nxt = np.flatnonzero((y_mid[cur] < y_mid) & + (column >= x_starting) & + (column < x_ending)) + if len(nxt): + nxt = nxt[0] + # print("column", column) + last = get_span(column, y_max[cur], y_min[nxt]) + last = min(last, x_ending[nxt], x_ending[cur]) + # print("nxt", nxt, y_mid[nxt], "%d:%d" % (column, last)) boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) - except: - logger.exception("cannot assign boxes") - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, bot]) - # dbg_plt(boxes[-1], "fallback box") - else: - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if len(x_starting)>0: - columns_covered_by_seps_covered_more_than_2col = set() - for dj in range(len(x_starting)): - if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_seps_covered_more_than_2col.update( - range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - if len(new_main_sep_y) > 0: - x_starting = np.append(x_starting, 0) - x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) - else: - x_starting = np.append(x_starting, x_starting[0]) - x_ending = np.append(x_ending, x_ending[0]) - else: - columns_not_covered = list(all_columns) - y_mid = np.append(y_mid, np.ones(len(columns_not_covered), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - - ind_args = np.arange(len(y_mid)) - - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - #print(y_mid_itself,'y_mid_itself') - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(x_start_itself, x_end_itself+1): - #print(column,'cols') - #print('burda2') - 
y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - #print(y_mid_next,'y_mid_next') - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') - boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) + peaks_neg_tot[last], + y_mid[cur], + y_mid[nxt]]) + # dbg_plt(boxes[-1], "recursive column %d:%d box [%d]" % (column, last, len(boxes))) + column = last + if last == x_ending[nxt] and x_ending[nxt] <= x_ending[cur] and nxt in args: + # child – recur + # print("recur", nxt, y_mid[nxt], "%d:%d" % (x_starting[nxt], x_ending[nxt])) + args.remove(nxt) + add_sep(nxt) + else: + # print("column", column) + last = get_span(column, y_max[cur], bot) + # print("bot", bot, "%d:%d" % (column, last)) + boxes.append([peaks_neg_tot[column], + peaks_neg_tot[last], + y_mid[cur], + bot]) + # dbg_plt(boxes[-1], "non-recursive column %d box [%d]" % (column, len(boxes))) + column = last + add_sep(cur) if right2left_readingorder: peaks_neg_tot_tables_new = [] From 4475183f08d2c25eb90deb04bda552930abd4ba0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Nov 2025 03:39:36 +0100 Subject: [PATCH 25/91] improve rules governing column split - reduce `sigma` for smoothing of input to `find_peaks` (so we get deeper gaps between columns) - allow column boundaries closer to the margins (50 instead of 100 or 200 px, 170 instead of 370 px) - allow column boundaries closer to each other (300 instead of 400 px) - add a secondary `grenze` criterion for depth of gap (relative to lowest minimum, if that is smaller than the old criterion relative to lowest maximum) - for calls to `find_num_col` within parts of a page, do allow unbalanced column boundaries --- src/eynollah/utils/__init__.py | 113 +++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 47 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index e00004f..570eefe 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -241,7 +241,7 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8): z = gaussian_filter1d(regions_without_separators_0, sigma_) return np.std(z) -def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8): +def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8, unbalanced=False): if not regions_without_separators.any(): return 0, [] regions_without_separators_0 = regions_without_separators.sum(axis=0) @@ -249,13 +249,15 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # ax1.imshow(regions_without_separators, aspect="auto") # ax2.plot(regions_without_separators_0) # plt.show() - sigma_ = 35 # 70#35 + sigma_ = 25 # 70#35 meda_n_updown = regions_without_separators_0[::-1] first_nonzero = next((i for i, x in enumerate(regions_without_separators_0) if x), 0) last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) last_nonzero = len(regions_without_separators_0) - last_nonzero - last_nonzero = last_nonzero - 100 - first_nonzero = first_nonzero + 200 + last_nonzero = last_nonzero - 50 #- 100 + first_nonzero = first_nonzero + 50 #+ 200 + last_offmargin = len(regions_without_separators_0) - 170 #370 + first_offmargin = 170 #370 y = regions_without_separators_0 # [first_nonzero:last_nonzero] y_help = np.zeros(len(y) + 20) y_help[10 : len(y) + 10] = y @@ 
-285,26 +287,34 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # ax2.axvline(last_nonzero, label="last nonzero") # ax2.text(first_nonzero, 0, "first nonzero", rotation=90) # ax2.text(last_nonzero, 0, "last nonzero", rotation=90) - # ax2.axvline(370, label="first") - # ax2.axvline(len(y) - 370, label="last") - # ax2.text(370, 0, "first", rotation=90) - # ax2.text(len(y) - 370, 0, "last", rotation=90) + # ax2.axvline(first_offmargin, label="first offmargin") + # ax2.axvline(last_offmargin, label="last offmargin") + # ax2.text(first_offmargin, 0, "first offmargin", rotation=90) + # ax2.text(last_offmargin, 0, "last offmargin", rotation=90) # plt.show() peaks_neg = peaks_neg - 10 - 10 + # print("raw peaks", peaks) peaks = peaks[(peaks > 0.06 * len(y)) & (peaks < 0.94 * len(y))] + # print("non-marginal peaks", peaks) interest_pos = z[peaks] + # print("interest_pos", interest_pos) interest_pos = interest_pos[interest_pos > 10] if not interest_pos.any(): return 0, [] + # plt.plot(z) # plt.show() + #print("raw peaks_neg", peaks_neg) peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & (peaks_neg < last_nonzero)] - peaks_neg = peaks_neg[(peaks_neg > 370) & - (peaks_neg < len(y) - 370)] + #print("non-zero peaks_neg", peaks_neg) + peaks_neg = peaks_neg[(peaks_neg > first_offmargin) & + (peaks_neg < last_offmargin)] + #print("non-marginal peaks_neg", peaks_neg) interest_neg = z[peaks_neg] + #print("interest_neg", interest_neg) if not interest_neg.any(): return 0, [] @@ -317,10 +327,14 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl min_peaks_neg = 0 # np.min(interest_neg) + # cutoff criterion: fixed fraction of lowest column height dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier grenze = min_peaks_pos - dis_talaei #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0 + # extra criterion: fixed multiple of lowest gap height + grenze = min(grenze, multiplier * (5 + np.min(interest_neg))) + # print(interest_neg,'interest_neg') # print(grenze,'grenze') # print(min_peaks_pos,'min_peaks_pos') @@ -356,18 +370,20 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') # cancel if resulting split is highly unbalanced across available width - if ((num_col == 3 and - ((peaks_neg_fin[0] > 0.75 * len(y) and - peaks_neg_fin[1] > 0.75 * len(y)) or - (peaks_neg_fin[0] < 0.25 * len(y) and - peaks_neg_fin[1] < 0.25 * len(y)) or - (peaks_neg_fin[0] < 0.5 * len(y) - 200 and - peaks_neg_fin[1] < 0.5 * len(y)) or - (peaks_neg_fin[0] > 0.5 * len(y) + 200 and - peaks_neg_fin[1] > 0.5 * len(y)))) or - (num_col == 2 and - (peaks_neg_fin[0] > 0.75 * len(y) or - peaks_neg_fin[0] < 0.25 * len(y)))): + if unbalanced: + pass + elif ((num_col == 3 and + ((peaks_neg_fin[0] > 0.75 * len(y) and + peaks_neg_fin[1] > 0.75 * len(y)) or + (peaks_neg_fin[0] < 0.25 * len(y) and + peaks_neg_fin[1] < 0.25 * len(y)) or + (peaks_neg_fin[0] < 0.5 * len(y) - 200 and + peaks_neg_fin[1] < 0.5 * len(y)) or + (peaks_neg_fin[0] > 0.5 * len(y) + 200 and + peaks_neg_fin[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_fin[0] > 0.75 * len(y) or + peaks_neg_fin[0] < 0.25 * len(y)))): num_col = 1 peaks_neg_fin = [] @@ -376,7 +392,7 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # filter out peaks that are too close (<400px) to each other: # among each group, pick the position with smallest amount of text diff_peaks = 
np.abs(np.diff(peaks_neg_fin)) - cut_off = 400 + cut_off = 300 #400 peaks_neg_true = [] forest = [] # print(len(peaks_neg_fin),'len_') @@ -401,30 +417,32 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl #print(peaks_neg_true, "peaks_neg_true") ##print(num_col,'early') # cancel if resulting split is highly unbalanced across available width - if ((num_col == 3 and - ((peaks_neg_true[0] > 0.75 * len(y) and - peaks_neg_true[1] > 0.75 * len(y)) or - (peaks_neg_true[0] < 0.25 * len(y) and - peaks_neg_true[1] < 0.25 * len(y)) or - (peaks_neg_true[0] < 0.5 * len(y) - 200 and - peaks_neg_true[1] < 0.5 * len(y)) or - (peaks_neg_true[0] > 0.5 * len(y) + 200 and - peaks_neg_true[1] > 0.5 * len(y)))) or - (num_col == 2 and - (peaks_neg_true[0] > 0.75 * len(y) or - peaks_neg_true[0] < 0.25 * len(y)))): + if unbalanced: + pass + elif ((num_col == 3 and + ((peaks_neg_true[0] > 0.75 * len(y) and + peaks_neg_true[1] > 0.75 * len(y)) or + (peaks_neg_true[0] < 0.25 * len(y) and + peaks_neg_true[1] < 0.25 * len(y)) or + (peaks_neg_true[0] < 0.5 * len(y) - 200 and + peaks_neg_true[1] < 0.5 * len(y)) or + (peaks_neg_true[0] > 0.5 * len(y) + 200 and + peaks_neg_true[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_true[0] > 0.75 * len(y) or + peaks_neg_true[0] < 0.25 * len(y)))): num_col = 1 peaks_neg_true = [] - if (num_col == 3 and - (peaks_neg_true[0] < 0.75 * len(y) and - peaks_neg_true[0] > 0.25 * len(y) and - peaks_neg_true[1] > 0.80 * len(y))): + elif (num_col == 3 and + (peaks_neg_true[0] < 0.75 * len(y) and + peaks_neg_true[0] > 0.25 * len(y) and + peaks_neg_true[1] > 0.80 * len(y))): num_col = 2 peaks_neg_true = [peaks_neg_true[0]] - if (num_col == 3 and - (peaks_neg_true[1] < 0.75 * len(y) and - peaks_neg_true[1] > 0.25 * len(y) and - peaks_neg_true[0] < 0.20 * len(y))): + elif (num_col == 3 and + (peaks_neg_true[1] < 0.75 * len(y) and + peaks_neg_true[1] > 0.25 * len(y) and + peaks_neg_true[0] < 0.20 * len(y))): num_col = 2 peaks_neg_true = [peaks_neg_true[1]] @@ -1151,8 +1169,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] - # assert len(final_indexers_sorted) == len(contours_main) + len(contours_head) - # assert not len(final_indexers_sorted) or max(final_index_type) == max(len(contours_main) + assert len(set(final_indexers_sorted)) == len(contours_main) + len(contours_head) + assert set(final_index_type) == set(range(len(contours_main))).union(range(len(contours_head))) return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type) @@ -1518,7 +1536,8 @@ def return_boxes_of_images_by_order_of_reading_new( regions_without_separators[top:bot], # we do not expect to get all columns in small parts (headings etc.): num_col_classifier if bot - top >= big_part else 1, - tables, multiplier=6. if erosion_hurts else 7.) + tables, multiplier=6. if erosion_hurts else 7., + unbalanced=True) except: peaks_neg_fin=[] num_col = 0 @@ -1534,7 +1553,7 @@ def return_boxes_of_images_by_order_of_reading_new( if len(peaks_neg_fin)==0: num_col, peaks_neg_fin = find_num_col( regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=3.) 
+ num_col_classifier, tables, multiplier=3., unbalanced=True) #print(peaks_neg_fin,'peaks_neg_fin') peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] From 3c15c4f7d4bf03fee11c54da82ba7d29f09ada5a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 14:29:41 +0100 Subject: [PATCH 26/91] back to `rotate_image` instead of `rotation_image_new` for deskewing (because the latter does not preserve coordinates; it scales, even when resizing the image; this caused coordinate problems when matching deskewed contours) --- src/eynollah/eynollah.py | 58 +++++++++------------------------------- 1 file changed, 13 insertions(+), 45 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 2bdb2c7..efd67d5 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -88,12 +88,7 @@ from .utils.contour import ( join_polygons, make_intersection, ) -from .utils.rotate import ( - rotate_image, - rotation_not_90_func, - rotation_not_90_func_full_layout, - rotation_image_new -) +from .utils.rotate import rotate_image from .utils.utils_ocr import ( return_start_and_end_of_common_text_of_textline_ocr_without_common_section, return_textline_contour_with_added_box_coordinate, @@ -3131,11 +3126,9 @@ class Eynollah: self.logger.debug('enter run_boxes_no_full_layout') t_0_box = time.time() if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = rotation_not_90_func( - image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 if self.tables: regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 @@ -3276,20 +3269,9 @@ class Eynollah: text_regions_p[:,:][table_prediction[:,:]==1] = 10 img_revised_tab = text_regions_p[:,:] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ - rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, - table_prediction, slope_deskew) - - text_regions_p_d = resize_image(text_regions_p_d, - text_regions_p.shape[0], - text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, - text_regions_p.shape[0], - text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) - + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: @@ -3303,20 +3285,9 @@ class Eynollah: else: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ - rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, - table_prediction, slope_deskew) - - text_regions_p_d = resize_image(text_regions_p_d, - 
text_regions_p.shape[0], - text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, - text_regions_p.shape[0], - text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) - + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: @@ -3465,12 +3436,9 @@ class Eynollah: #plt.show() ####if not self.tables: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, regions_fully_n = rotation_not_90_func_full_layout( - image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) - - text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + regions_fully_n = rotate_image(regions_fully, slope_deskew) if not self.tables: regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 else: From 5a778003fde3cc540f3b8b1c00bc6eebee1f9295 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 14:32:22 +0100 Subject: [PATCH 27/91] contour matching for deskewed image: ensure matches for both sides --- src/eynollah/eynollah.py | 42 +++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index efd67d5..b7c6ddf 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4461,42 +4461,42 @@ class Eynollah: dists[i] = np.linalg.norm(centers[:, i:i + 1] - centers_d, axis=0) corresp = np.zeros(dists.shape, dtype=bool) # keep searching next-closest until at least one correspondence on each side - while not np.all(corresp.sum(axis=1)) and not np.all(corresp.sum(axis=0)): + while not np.all(corresp.sum(axis=1)) or not np.all(corresp.sum(axis=0)): idx = np.nanargmin(dists) i, j = np.unravel_index(idx, dists.shape) dists[i, j] = np.nan corresp[i, j] = True - #print("original/deskewed adjacency", corresp.nonzero()) + # print("original/deskewed adjacency", corresp.nonzero()) contours_only_text_parent_d_ordered = np.zeros_like(contours_only_text_parent) contours_only_text_parent_d_ordered = contours_only_text_parent_d[np.argmax(corresp, axis=1)] # img1 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # for i in range(len(contours_only_text_parent)): # cv2.fillPoly(img1, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) - # plt.subplot(2, 2, 1, title="direct corresp contours") + # plt.subplot(1, 4, 1, title="direct corresp contours") # plt.imshow(img1) # img2 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # join deskewed regions mapping to single original ones for i in range(len(contours_only_text_parent)): if np.count_nonzero(corresp[i]) > 1: indices = np.flatnonzero(corresp[i]) - #print("joining", indices) + # print("joining", indices) polygons_d = [contour2polygon(contour) for contour in contours_only_text_parent_d[indices]] contour_d = polygon2contour(join_polygons(polygons_d)) 
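An illustrative aside on the loop condition fixed in this hunk (toy values,
not from the patch): with 2 original and 3 deskewed contours, greedy
nearest-pair picking covers all rows after two picks, so with `and` the loop
stopped there and left deskewed contours 1 and 2 unmatched; with `or` it keeps
assigning next-closest pairs until every row and every column has at least one
correspondence:

    import numpy as np

    dists = np.array([[1., 4., 5.],
                      [2., 3., 6.]])  # pairwise center distances (made up)
    corresp = np.zeros(dists.shape, dtype=bool)
    while not np.all(corresp.sum(axis=1)) or not np.all(corresp.sum(axis=0)):
        i, j = np.unravel_index(np.nanargmin(dists), dists.shape)
        dists[i, j] = np.nan
        corresp[i, j] = True
    print(corresp)  # [[ True  True  True]
                    #  [ True  True False]]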
contours_only_text_parent_d_ordered[i] = contour_d # cv2.fillPoly(img2, pts=[contour_d], color=i + 1) - # plt.subplot(2, 2, 3, title="joined contours") + # plt.subplot(1, 4, 2, title="joined contours") # plt.imshow(img2) # img3 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # split deskewed regions mapping to multiple original ones def deskew(polygon): polygon = shapely.affinity.rotate(polygon, -slope_deskew, origin=center) - polygon = shapely.affinity.translate(polygon, *offset.squeeze()) + #polygon = shapely.affinity.translate(polygon, *offset.squeeze()) return polygon for j in range(len(contours_only_text_parent_d)): if np.count_nonzero(corresp[:, j]) > 1: indices = np.flatnonzero(corresp[:, j]) - #print("splitting along", indices) + # print("splitting along", indices) polygons = [deskew(contour2polygon(contour)) for contour in contours_only_text_parent[indices]] polygon_d = contour2polygon(contours_only_text_parent_d[j]) @@ -4509,14 +4509,38 @@ class Eynollah: if polygon_d] contours_only_text_parent_d_ordered[indices] = contours_d # cv2.fillPoly(img3, pts=contours_d, color=j + 1) - # plt.subplot(2, 2, 4, title="split contours") + # plt.subplot(1, 4, 3, title="split contours") # plt.imshow(img3) # img4 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # for i in range(len(contours_only_text_parent)): # cv2.fillPoly(img4, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) - # plt.subplot(2, 2, 2, title="result contours") + # plt.subplot(1, 4, 4, title="result contours") # plt.imshow(img4) # plt.show() + # from matplotlib import patches as ptchs + # plt.subplot(1, 2, 1, title="undeskewed") + # plt.imshow(text_only) + # centers = np.stack(find_center_of_contours(contours_only_text_parent)) # [2, N] + # for i in range(len(contours_only_text_parent)): + # cnt = contours_only_text_parent[i] + # ctr = centers[:, i] + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='blue')) + # plt.gca().scatter(ctr[0], ctr[1], 20, c='blue', marker='x') + # plt.gca().text(ctr[0], ctr[1], str(i), c='blue') + # plt.subplot(1, 2, 2, title="deskewed") + # plt.imshow(text_only_d) + # centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d_ordered)) # [2, N] + # for i in range(len(contours_only_text_parent)): + # cnt = contours_only_text_parent[i] + # cnt = polygon2contour(deskew(contour2polygon(cnt))) + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='blue')) + # for i in range(len(contours_only_text_parent_d_ordered)): + # cnt = contours_only_text_parent_d_ordered[i] + # ctr = centers_d[:, i] + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='red')) + # plt.gca().scatter(ctr[0], ctr[1], 20, c='red', marker='x') + # plt.gca().text(ctr[0], ctr[1], str(i), c='red') + # plt.show() if not len(contours_only_text_parent): # stop early From 72d059f3c973b942945b62d4463a6ea031043efc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 14:34:12 +0100 Subject: [PATCH 28/91] reading order: simplify assignment / counting - `do_order_of_regions`: simplify aggregating per-box orders for paragraphs and headings to overall order passed to `xml_reading_order`; no need for `order_and_id_of_texts`, no need to return `id_of_texts_tot` - `do_order_of_regions_with_model`: no need to return `region_ids` - writer: no need to pass `id_of_texts_tot` in `build_pagexml` --- src/eynollah/eynollah.py | 70 +++++++++++++--------------------- src/eynollah/utils/__init__.py | 1 + src/eynollah/writer.py | 6 +-- 3 
files changed, 30 insertions(+), 47 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index b7c6ddf..6024646 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -134,7 +134,6 @@ from .utils import ( return_boxes_of_images_by_order_of_reading_new ) from .utils.pil_cv2 import check_dpi, pil2cv -from .utils.xml import order_and_id_of_texts from .plot import EynollahPlotter from .writer import EynollahXmlWriter @@ -2546,9 +2545,7 @@ class Eynollah: args_contours_head = np.arange(len(contours_only_text_parent_h)) order_by_con_head = np.zeros_like(arg_text_con_head) - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] + idx = 0 for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) @@ -2557,37 +2554,25 @@ class Eynollah: con_inter_box = contours_only_text_parent[args_contours_box_main] con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] - indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + _, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0]) - order_of_texts, id_of_texts = order_and_id_of_texts( - con_inter_box, con_inter_box_h, - indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + for tidx, kind in zip(index_by_kind_sorted, kind_of_texts_sorted): + if kind == 1: + # print(iij, "main", args_contours_box_main[tidx], "becomes", idx) + order_by_con_main[args_contours_box_main[tidx]] = idx + else: + # print(iij, "head", args_contours_box_head[tidx], "becomes", idx) + order_by_con_head[args_contours_box_head[tidx]] = idx + idx += 1 - indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] - indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] - indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - - for zahler, _ in enumerate(args_contours_box_main): - arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for zahler, _ in enumerate(args_contours_box_head): - arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for jji in range(len(id_of_texts)): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point += len(id_of_texts) - - order_of_texts_tot = np.concatenate((order_by_con_main, - order_by_con_head)) - order_text_new = np.argsort(order_of_texts_tot) - return order_text_new, id_of_texts_tot + # xml writer will create region ids in order of + # - contours_only_text_parent (main text), followed by + # - contours_only_text_parent (headings), + # and then create regionrefs into these ordered by order_text_new + order_text_new = np.argsort(np.concatenate((order_by_con_main, + order_by_con_head))) + return order_text_new try: results = match_boxes(False) @@ -3600,7 +3585,7 @@ class Eynollah: co_text_all = contours_only_text_parent if not len(co_text_all): - return [], [] + return [] labels_con = np.zeros((int(y_len /6.), int(x_len/6.), len(co_text_all)), dtype=bool) co_text_all = [(i/6).astype(int) for i in co_text_all] @@ -3683,11 +3668,9 @@ class Eynollah: else: org_contours_indexes.extend([indexes_of_located_cont[region_with_curr_order]]) - 
region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] - return org_contours_indexes, region_ids + return org_contours_indexes else: - region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] - return ordered, region_ids + return ordered def return_start_and_end_of_common_text_of_textline_ocr(self,textline_image, ind_tot): width = np.shape(textline_image)[1] @@ -4222,7 +4205,6 @@ class Eynollah: order_text_new = [0] slopes =[0] - id_of_texts_tot =['region_0001'] conf_contours_textregions =[0] if self.ocr and not self.tr: @@ -4234,7 +4216,7 @@ class Eynollah: ocr_all_textlines = None pcgts = self.writer.build_pagexml_no_full_layout( - cont_page, page_coord, order_text_new, id_of_texts_tot, + cont_page, page_coord, order_text_new, all_found_textline_polygons, page_coord, [], [], [], [], [], [], [], slopes, [], [], @@ -4736,14 +4718,14 @@ class Eynollah: self.logger.info("Headers ignored in reading order") if self.reading_order_machine_based: - order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( + order_text_new = self.do_order_of_regions_with_model( contours_only_text_parent, contours_only_text_parent_h, text_regions_p) else: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions( + order_text_new = self.do_order_of_regions( contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: - order_text_new, id_of_texts_tot = self.do_order_of_regions( + order_text_new = self.do_order_of_regions( contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") @@ -4840,7 +4822,7 @@ class Eynollah: if self.full_layout: pcgts = self.writer.build_pagexml_full_layout( - contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, + contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals_left, polygons_of_marginals_right, @@ -4853,7 +4835,7 @@ class Eynollah: conf_contours_textregions, conf_contours_textregions_h) else: pcgts = self.writer.build_pagexml_no_full_layout( - contours_only_text_parent, page_coord, order_text_new, id_of_texts_tot, + contours_only_text_parent, page_coord, order_text_new, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 570eefe..20766a8 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1158,6 +1158,7 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): # cnt = (contours_main if type_ == 1 else contours_head)[idx] # col = 'red' if type_ == 1 else 'blue' # plt.scatter(cx - x_ref, cy - y_ref, 20, c=col, marker='o') + # plt.text(cx - x_ref, cy - y_ref, str(idx), c=col) # plt.gca().add_patch(patches.Polygon(cnt[:, 0] - [[x_ref, y_ref]], closed=False, fill=False, color=col)) # plt.title("box contours centered in %d:%d (red=main / blue=heading)" % (top, bot)) # plt.show() diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 9c3456a..f8aff62 100644 --- a/src/eynollah/writer.py +++ 
b/src/eynollah/writer.py @@ -89,7 +89,7 @@ class EynollahXmlWriter: def build_pagexml_no_full_layout( self, found_polygons_text_region, - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, @@ -102,7 +102,7 @@ class EynollahXmlWriter: **kwargs): return self.build_pagexml_full_layout( found_polygons_text_region, [], - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, [], all_box_coord, [], found_polygons_text_region_img, found_polygons_tables, [], @@ -116,7 +116,7 @@ class EynollahXmlWriter: def build_pagexml_full_layout( self, found_polygons_text_region, found_polygons_text_region_h, - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, From 49ab269e085505940a17c355905795d91777a451 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 15:46:08 +0100 Subject: [PATCH 29/91] fix typos found by ruff --- src/eynollah/sbb_binarize.py | 2 +- src/eynollah/utils/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index 0eab2ae..b81f45e 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -328,7 +328,7 @@ class SbbBinarizer: print(input_path, 'image_name') if os.path.exists(output_path): if overwrite: - self.logger.warning("will overwrite existing output file '%s'", output_ptah) + self.logger.warning("will overwrite existing output file '%s'", output_path) else: self.logger.warning("will skip input for existing output file '%s'", output_path) image = cv2.imread(input_path) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 20766a8..7be1fd0 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -146,7 +146,7 @@ def return_multicol_separators_x_start_end( args_emptysep.add(i) # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty next sep") args_to_be_kept = [arg for arg in args_ysorted - if not arg in args_emptysep] + if arg not in args_emptysep] x_start = x_start[args_to_be_kept] x_end = x_end[args_to_be_kept] y_min = y_min[args_to_be_kept] From 028ed169212df4a1048b26d691e1edc53592f230 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 17:17:37 +0100 Subject: [PATCH 30/91] adapt ocrd-sbb-binarize --- src/eynollah/ocrd_cli_binarization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/eynollah/ocrd_cli_binarization.py b/src/eynollah/ocrd_cli_binarization.py index 848bbac..6289517 100644 --- a/src/eynollah/ocrd_cli_binarization.py +++ b/src/eynollah/ocrd_cli_binarization.py @@ -70,7 +70,7 @@ class SbbBinarizeProcessor(Processor): if oplevel == 'page': self.logger.info("Binarizing on 'page' level in page '%s'", page_id) - page_image_bin = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True)) + page_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(page_image), use_patches=True)) # update PAGE (reference the image file): page_image_ref = AlternativeImageType(comments=page_xywh['features'] + ',binarized,clipped') page.add_AlternativeImage(page_image_ref) @@ -83,7 +83,7 @@ class SbbBinarizeProcessor(Processor): for region in regions: region_image, region_xywh = 
self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') - region_image_bin = cv2pil(self.binarizer.run(image=pil2cv(region_image), use_patches=True)) + region_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(region_image), use_patches=True)) # update PAGE (reference the image file): region_image_ref = AlternativeImageType(comments=region_xywh['features'] + ',binarized') region.add_AlternativeImage(region_image_ref) @@ -95,7 +95,7 @@ class SbbBinarizeProcessor(Processor): self.logger.warning("Page '%s' contains no text lines", page_id) for line in lines: line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') - line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image), use_patches=True)) + line_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(line_image), use_patches=True)) # update PAGE (reference the image file): line_image_ref = AlternativeImageType(comments=line_xywh['features'] + ',binarized') line.add_AlternativeImage(region_image_ref) From 406288b1fed020c2a68e20114ec51fe4d7f580f8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 20:13:58 +0100 Subject: [PATCH 31/91] fixup 72d059f3: forgot to update other writer calls --- src/eynollah/eynollah.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6024646..46a1704 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4164,7 +4164,7 @@ class Eynollah: image_page, page_coord, cont_page = \ self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], + [], page_coord, [], [], [], polygons_of_images, [], [], [], [], [], [], [], [], [], cont_page, [], []) if self.plotter: @@ -4282,7 +4282,7 @@ class Eynollah: self.logger.info("No columns detected - generating empty PAGE-XML") pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], [], + [], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], cont_page, [], []) return pcgts @@ -4529,7 +4529,7 @@ class Eynollah: empty_marginals = [[]] * len(polygons_of_marginals) if self.full_layout: pcgts = self.writer.build_pagexml_full_layout( - [], [], page_coord, [], [], [], [], [], [], + [], [], page_coord, [], [], [], [], [], polygons_of_images, contours_tables, [], polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, @@ -4538,7 +4538,7 @@ class Eynollah: cont_page, polygons_seplines) else: pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], + [], page_coord, [], [], [], polygons_of_images, polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, From e428e7ad78629d9d4a39fa9c49f88aa4c6244139 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 16 Nov 2025 12:17:29 +0100 Subject: [PATCH 32/91] ensure separators stay within image bounds --- src/eynollah/utils/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 7be1fd0..307d8f3 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1400,6 +1400,14 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, matrix_of_seps_ch = np.append( matrix_of_seps_ch, matrix_l_n, axis=0) + # 
ensure no seps are out of bounds + matrix_of_seps_ch[:, 1] = np.maximum(np.minimum(matrix_of_seps_ch[:, 1], region_pre_p.shape[1]), 0) + matrix_of_seps_ch[:, 2] = np.maximum(matrix_of_seps_ch[:, 2], 0) + matrix_of_seps_ch[:, 3] = np.minimum(matrix_of_seps_ch[:, 3], region_pre_p.shape[1]) + matrix_of_seps_ch[:, 5] = np.maximum(np.minimum(matrix_of_seps_ch[:, 5], region_pre_p.shape[0]), 0) + matrix_of_seps_ch[:, 6] = np.maximum(matrix_of_seps_ch[:, 6], 0) + matrix_of_seps_ch[:, 7] = np.minimum(matrix_of_seps_ch[:, 7], region_pre_p.shape[0]) + cy_seps_splitters=cy_seps_hor[(x_min_seps_hor<=.16*region_pre_p.shape[1]) & (x_max_seps_hor>=.84*region_pre_p.shape[1])] cy_seps_splitters = np.append(cy_seps_splitters, special_separators) @@ -1621,7 +1629,7 @@ def return_boxes_of_images_by_order_of_reading_new( starting = xmin - peaks_neg_tot min_start = np.flatnonzero(starting >= 0)[-1] # last left-of ending = xmax - peaks_neg_tot - max_end = np.flatnonzero(ending < 0)[0] # first right-of + max_end = np.flatnonzero(ending <= 0)[0] # first right-of # skip elongation unless this is already a multi-column separator/heading: if not max_end - min_start > 1: continue From ee59a6809dedd175fc47a159e6a274f7f43dd534 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 16:17:09 +0100 Subject: [PATCH 33/91] contours_in_same_horizon: fix 5d15941b --- src/eynollah/utils/contour.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 052688c..393acdd 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -20,7 +20,7 @@ def contours_in_same_horizon(cy_main_hor): by index into the array. """ sort = np.argsort(cy_main_hor) - same = np.diff(cy_main_hor[sort] <= 20) + same = np.diff(cy_main_hor[sort]) <= 20 # groups = np.split(sort, np.arange(len(cy_main_hor) - 1)[~same] + 1) same = np.flatnonzero(same) return np.stack((sort[:-1][same], sort[1:][same])).T From 38d91673b11fb6dde03b98325d2dca2ef282310a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 16:50:08 +0100 Subject: [PATCH 34/91] combine_hor_lines_and_delete_cross_points: get external contours instead of tree without looking at the actual hierarchy (to prevent retrieving holes as separators) --- src/eynollah/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 307d8f3..1934f10 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1180,7 +1180,7 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( #img_p_in_ver = cv2.erode(img_p_in_ver, self.kernel, iterations=2) _, thresh = cv2.threshold(img_p_in_ver, 0, 255, 0) - contours_lines_ver, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + contours_lines_ver, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) slope_lines_ver, _, x_min_main_ver, _, _, _, y_min_main_ver, y_max_main_ver, cx_main_ver = \ find_features_of_lines(contours_lines_ver) for i in range(len(x_min_main_ver)): @@ -1194,7 +1194,7 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( int(cx_main_ver[i])+25] = 0 _, thresh = cv2.threshold(img_in_hor, 0, 255, 0) - contours_lines_hor, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + contours_lines_hor, _ = cv2.findContours(thresh.astype(np.uint8), 
cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     slope_lines_hor, dist_x_hor, x_min_main_hor, x_max_main_hor, cy_main_hor, _, _, _, _ = \
         find_features_of_lines(contours_lines_hor)

From 06cb9d1d3184ebf35d524305785fbe28b1d9c3f8 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 28 Nov 2025 17:08:39 +0100
Subject: [PATCH 35/91] combine_hor_lines_and_delete_cross_points: fix 1-off px
 bug when eroding the vertical separator mask (by slicing), avoid leaving 1px
 strips

---
 src/eynollah/utils/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index 1934f10..345d438 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -1189,7 +1189,7 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(
                      int(cx_main_ver[i])-25:
                      int(cx_main_ver[i])+25] = 0
         img_p_in_ver[int(y_max_main_ver[i])-30:
-                     int(y_max_main_ver[i]),
+                     int(y_max_main_ver[i]+1),
                      int(cx_main_ver[i])-25:
                      int(cx_main_ver[i])+25] = 0

From 5c12b6a8513b202fb97e1ccb93854a906aab6677 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 28 Nov 2025 17:27:12 +0100
Subject: [PATCH 36/91] combine_hor_lines_and_delete_cross_points: simplify and
 rename
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- `x_width_smaller_than_acolumn_width` → `avg_col_width`
- `len_lines_bigger_than_x_width_smaller_than_acolumn_width` → `nseps_wider_than_avg_col_width`
- `img_in_hor` → `img_p_in_hor` (analogous to vertical)

---
 src/eynollah/utils/__init__.py | 52 +++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index 345d438..0f9dcaf 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -1176,7 +1176,23 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref):
     return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type)

 def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(
-        img_p_in_ver, img_in_hor,num_col_classifier):
+        img_p_in_ver: np.ndarray,
+        img_p_in_hor: np.ndarray,
+        num_col_classifier: int,
+) -> Tuple[np.ndarray, List[float]]:
+    """
+    Given a horizontal and vertical separator mask, combine horizontal separators
+    (where possible) and make sure they do not cross each other.
+
+    Arguments:
+    * img_p_in_ver: mask of vertical separators
+    * img_p_in_hor: mask of horizontal separators
+    * num_col_classifier: predicted (expected) number of columns
+
+    Returns: a tuple of
+    * the final horizontal separators
+    * the y coordinates with horizontal separators spanning the full width
+    """
     #img_p_in_ver = cv2.erode(img_p_in_ver, self.kernel, iterations=2)
     _, thresh = cv2.threshold(img_p_in_ver, 0, 255, 0)
     contours_lines_ver, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     slope_lines_ver, _, x_min_main_ver, _, _, _, y_min_main_ver, y_max_main_ver, cx_main_ver = \
         find_features_of_lines(contours_lines_ver)
     for i in range(len(x_min_main_ver)):
@@ -1192,20 +1208,26 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(
                      int(y_max_main_ver[i]+1),
                      int(cx_main_ver[i])-25:
                      int(cx_main_ver[i])+25] = 0
+    height, width = img_p_in_ver.shape

-    _, thresh = cv2.threshold(img_in_hor, 0, 255, 0)
+    _, thresh = cv2.threshold(img_p_in_hor, 0, 255, 0)
     contours_lines_hor, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    slope_lines_hor, dist_x_hor, x_min_main_hor, x_max_main_hor, cy_main_hor, _, _, _, _ = \
-        find_features_of_lines(contours_lines_hor)
-    x_width_smaller_than_acolumn_width=img_in_hor.shape[1]/float(num_col_classifier+1.)
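+    # unpack horizontal separator features (slope, x span, x/y extrema, centers):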
+    (slope_lines_hor,
+     dist_x_hor,
+     x_min_main_hor,
+     x_max_main_hor,
+     cy_main_hor, _,
+     y_min_main_hor,
+     y_max_main_hor,
+     _) = find_features_of_lines(contours_lines_hor)

-    len_lines_bigger_than_x_width_smaller_than_acolumn_width=len( dist_x_hor[dist_x_hor>=x_width_smaller_than_acolumn_width] )
-    len_lines_bigger_than_x_width_smaller_than_acolumn_width_per_column=int(len_lines_bigger_than_x_width_smaller_than_acolumn_width /
-                                                                            float(num_col_classifier))
-    if len_lines_bigger_than_x_width_smaller_than_acolumn_width_per_column < 10:
+    avg_col_width = width / float(num_col_classifier + 1)
+    nseps_wider_than_avg_col_width = np.count_nonzero(dist_x_hor>=avg_col_width)
+    if nseps_wider_than_avg_col_width < 10 * num_col_classifier:
         args_hor=np.arange(len(slope_lines_hor))
         sep_pairs=contours_in_same_horizon(cy_main_hor)
+        img_p_in = np.copy(img_p_in_hor)
         if len(sep_pairs):
             special_separators=[]
             contours_new=[]
@@ -1242,21 +1264,19 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(
                     #     np.var( dist_x_hor[some_args] ),'jalibdiha')
                     special_separators.append(np.mean(cy_main_hor[some_args]))
         else:
-            img_p_in=img_in_hor
-            special_separators=[]
+            img_p_in = img_p_in_hor
+            special_separators = []

         img_p_in_ver[img_p_in_ver == 255] = 1
-        sep_ver_hor = img_p_in + img_p_in_ver
-        sep_ver_hor_cross = (sep_ver_hor == 2) * 1
-        _, thresh = cv2.threshold(sep_ver_hor_cross.astype(np.uint8), 0, 255, 0)
+        sep_ver_hor_cross = 255 * ((img_p_in > 0) & (img_p_in_ver > 0))
         contours_cross, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
         center_cross = np.array(find_center_of_contours(contours_cross), dtype=int)
         for cx, cy in center_cross.T:
             img_p_in[cy - 30: cy + 30, cx + 5: cx + 40] = 0
             img_p_in[cy - 30: cy + 30, cx - 40: cx - 4] = 0
     else:
-        img_p_in=np.copy(img_in_hor)
-        special_separators=[]
+        img_p_in = np.copy(img_p_in_hor)
+        special_separators = []
     return img_p_in, special_separators

From a527d7a10d50ff68af888ed66aba30c53d46520b Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 28 Nov 2025 17:34:11 +0100
Subject: [PATCH 37/91] combine_hor_lines_and_delete_cross_points: improve
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- avoid unnecessary `fillPoly` (we already have the mask)
- do not merge hseps if vseps interfere
- remove old criterion (based on total length of hseps)
- create new criterion (no x overlap and x close to each other)
- rename identifiers:
  * `sum_dis` → `sum_xspan`
  * `diff_max_min_uniques` → `tot_xspan`
  * np.std / np.mean → `dev_xspan`
- remove rule cutting around the center of crossing seps (which is
  unnecessary and creates small isolated seps at the center, unrelated
  to the actual crossing points)
- create rule cutting hseps by vseps _prior_ to merging

---
 src/eynollah/utils/__init__.py | 61 ++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 21 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index 0f9dcaf..765d5b1 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -1194,6 +1194,9 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(
     * the y coordinates with horizontal separators spanning the full width
     """
+    # cut horizontal seps by vertical seps
+    img_p_in_hor[img_p_in_ver > 0] = 0
+
     #img_p_in_ver = cv2.erode(img_p_in_ver, self.kernel, iterations=2)
     _, thresh =
cv2.threshold(img_p_in_ver, 0, 255, 0) contours_lines_ver, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) @@ -1237,24 +1240,34 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( some_cy=cy_main_hor[pair] some_x_min=x_min_main_hor[pair] some_x_max=x_max_main_hor[pair] + some_y_min=y_min_main_hor[pair] + some_y_max=y_max_main_hor[pair] + if np.any(img_p_in_ver[some_y_min.min(): some_y_max.max(), + some_x_max.min(): some_x_min.max()]): + # print("horizontal pair cut by vertical sep", pair, some_args, some_cy, + # "%d:%d" % (some_x_min[0], some_x_max[0]), + # "%d:%d" % (some_x_min[1], some_x_max[1])) + continue #img_in=np.zeros(separators_closeup_n[:,:,2].shape) #print(img_p_in_ver.shape[1],some_x_max-some_x_min,'xdiff') - diff_x_some=some_x_max-some_x_min - for jv in range(len(some_args)): - img_p_in=cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1,1,1)) - if any(i_diff>(img_p_in_ver.shape[1]/float(3.3)) for i_diff in diff_x_some): - img_p_in[int(np.mean(some_cy))-5: - int(np.mean(some_cy))+5, - int(np.min(some_x_min)): - int(np.max(some_x_max)) ]=1 - sum_dis=dist_x_hor[some_args].sum() - diff_max_min_uniques=np.max(x_max_main_hor[some_args])-np.min(x_min_main_hor[some_args]) + sum_xspan = dist_x_hor[some_args].sum() + tot_xspan = np.max(x_max_main_hor[some_args]) - np.min(x_min_main_hor[some_args]) + dev_xspan = np.std(dist_x_hor[some_args]) / np.mean(dist_x_hor[some_args]) + if (tot_xspan > sum_xspan and # no x overlap + sum_xspan > 0.85 * tot_xspan): # x close to each other + # print("merging horizontal pair", pair, some_args, some_cy, + # "%d:%d" % (some_x_min[0], some_x_max[0]), + # "%d:%d" % (some_x_min[1], some_x_max[1])) + img_p_in[int(np.mean(some_cy)) - 5: + int(np.mean(some_cy)) + 5, + np.min(some_x_min): + np.max(some_x_max)] = 255 - if (diff_max_min_uniques > sum_dis and - sum_dis / float(diff_max_min_uniques) > 0.85 and - diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and - np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): + if (tot_xspan > sum_xspan and # no x overlap + sum_xspan > 0.85 * tot_xspan and # x close to each other + tot_xspan > 0.85 * width and # nearly full width + dev_xspan < 0.55): # similar x span # print(dist_x_hor[some_args], # dist_x_hor[some_args].sum(), # np.min(x_min_main_hor[some_args]), @@ -1263,17 +1276,23 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( # np.std( dist_x_hor[some_args] ), # np.var( dist_x_hor[some_args] ),'jalibdiha') special_separators.append(np.mean(cy_main_hor[some_args])) + # print("special separator for midline", special_separators[-1]) + # plt.subplot(1, 2, 1, title='original horizontal (1) / vertical (2) seps') + # plt.imshow(1 * (img_p_in_hor > 0) + 2 * (img_p_in_ver > 0)) + # plt.subplot(1, 2, 2, title='extended horizontal seps') + # plt.imshow(img_p_in) + # plt.show() else: img_p_in = img_p_in_hor special_separators = [] - img_p_in_ver[img_p_in_ver == 255] = 1 - sep_ver_hor_cross = 255 * ((img_p_in > 0) & (img_p_in_ver > 0)) - contours_cross, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - center_cross = np.array(find_center_of_contours(contours_cross), dtype=int) - for cx, cy in center_cross.T: - img_p_in[cy - 30: cy + 30, cx + 5: cx + 40] = 0 - img_p_in[cy - 30: cy + 30, cx - 40: cx - 4] = 0 + #img_p_in_ver[img_p_in_ver == 255] = 1 + # sep_ver_hor_cross = 255 * ((img_p_in > 0) & (img_p_in_ver > 0)) + # contours_cross, _ = 
cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + # center_cross = np.array(find_center_of_contours(contours_cross), dtype=int) + # for cx, cy in center_cross.T: + # img_p_in[cy - 30: cy + 30, cx + 5: cx + 40] = 0 + # img_p_in[cy - 30: cy + 30, cx - 40: cx - 4] = 0 else: img_p_in = np.copy(img_p_in_hor) special_separators = [] From b71bb80e3ad9afa8f94c64af9dc73ee6269c5cae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 17:53:27 +0100 Subject: [PATCH 38/91] return_boxes_of_images_by_order_of_reading_new: fix 4abc2ff5 (forgot to also flip `regions_with_separators` if right2left) --- src/eynollah/utils/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 765d5b1..1aecd11 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1508,7 +1508,7 @@ def return_boxes_of_images_by_order_of_reading_new( * splitter_y_new: the y coordinates separating the parts * regions_without_separators: (text) region mask with separators suppressed; (needed to find per-part columns and to combine separators if possible) - * regions_with_separators: (full) region map with separators suppressed; + * regions_with_separators: (full) region map with separators included; (needed to elongate separators if possible) * matrix_of_seps: type and coordinates of horizontal and vertical separators, as well as headings @@ -1525,6 +1525,7 @@ def return_boxes_of_images_by_order_of_reading_new( if right2left_readingorder: regions_without_separators = cv2.flip(regions_without_separators,1) + regions_with_separators = cv2.flip(regions_with_separators,1) if logger is None: logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') From 5abf0c1097e76a038d451a78a785f08fa4e897bf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 17:58:44 +0100 Subject: [PATCH 39/91] return_boxes_of_images_by_order_of_reading_new: improve - when analysing regions spanning across columns, disregard tiny regions (smaller than half the median size) - if a region spans across columns just by a tiny fraction, and therefore is not good enough for a multi-col separator, then it should also not be good enough for a multi-col box maker --- src/eynollah/utils/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 1aecd11..bf2ec15 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1720,6 +1720,7 @@ def return_boxes_of_images_by_order_of_reading_new( # analyse connected components of regions to gain additional separators # and prepare a map for cross-column boxes ccounts = np.bincount(ccomps[top: bot].flatten()) + ccounts_median = np.median(ccounts) col_ccounts = np.stack([np.bincount(ccomps[top: bot, left: right].flatten(), minlength=ccounts.size) for left, right in pairwise(peaks_neg_tot)]) @@ -1727,6 +1728,9 @@ def return_boxes_of_images_by_order_of_reading_new( for label, label_count in enumerate(ccounts): if not label: continue + # ignore small labels for the purpose of finding multicol seps + if label_count < 0.5 * ccounts_median: + continue label_left, label_top, label_width, label_height, label_area = cstats[label] # if label_count < 0.9 * label_area: # # mostly not in this part of the page @@ -1738,15 +1742,15 @@ def return_boxes_of_images_by_order_of_reading_new( label_bot = 
label_top + label_height
                 label_start = np.flatnonzero(peaks_neg_tot > label_left)[0] - 1
                 label_end = np.flatnonzero(peaks_neg_tot >= label_right)[0]
+                if label_end - label_start < 2:
+                    continue
+                if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2:
+                    continue
                 # store as dict for multi-column boxes:
                 for start in range(label_start, label_end):
                     labelcolmap.setdefault(start, list()).append(
                         (label_end, label_top, label_bot,
                          sum(col_ccounts[start: label_end, label])))
                 # make additional separators:
-                if label_end - label_start < 2:
-                    continue
-                if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2:
-                    continue
                 x_min_hor_some = np.append(x_min_hor_some, [label_left] * 2)
                 x_max_hor_some = np.append(x_max_hor_some, [label_right] * 2)
                 y_min_hor_some = np.append(y_min_hor_some, [label_top - 2, label_bot])

From 84d10962f3382fd912ca5acef7fcb3d395aad41a Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 28 Nov 2025 18:04:12 +0100
Subject: [PATCH 40/91] return_boxes_of_images_by_order_of_reading_new: improve

- when searching for multi-col box makers, pick the right-most allowable
  column, not the left-most

---
 src/eynollah/utils/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index bf2ec15..2ebf48a 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -1796,7 +1796,7 @@ def return_boxes_of_images_by_order_of_reading_new(
             #       "box height", (y_bot - y_top),
             #       "label height", sum(regions_without_separators[
             #           y_top: y_bot, peaks_neg_tot[start + 1]]))
-            return min((last for last, l_top, l_bot, l_count in labelcolmap.get(start, [])
+            return max((last for last, l_top, l_bot, l_count in labelcolmap.get(start, [])
                         # yield the right-most column that does not cut through
                         # any regions in this horizontal span
                         if y_top < l_bot and y_bot > l_top

From 4dd40c542b3384322febf821c0c761bc9cb4dc46 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 28 Nov 2025 18:07:15 +0100
Subject: [PATCH 41/91] find_num_col: add optional criterion - sum of vertical
 separators

when searching for gaps between text regions, consider the vertical
separator mask (if given): add the vertical sum of vertical separators
to the peak scores (making column detection more robust if still
slightly skewed or partially obscured by multi-column regions, but fg
seps are present)

---
 src/eynollah/utils/__init__.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index 2ebf48a..0f2dac3 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -241,10 +241,13 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8):
     z = gaussian_filter1d(regions_without_separators_0, sigma_)
     return np.std(z)

-def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8, unbalanced=False):
+def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8, unbalanced=False, vertical_separators=None):
     if not regions_without_separators.any():
         return 0, []
+    if vertical_separators is None:
+        vertical_separators = np.zeros_like(regions_without_separators)
     regions_without_separators_0 = regions_without_separators.sum(axis=0)
+    vertical_separators_0 = vertical_separators.sum(axis=0)
     # fig, (ax1, ax2) = plt.subplots(2, sharex=True)
     # ax1.imshow(regions_without_separators, aspect="auto")
     #
ax2.plot(regions_without_separators_0) @@ -258,13 +261,12 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl first_nonzero = first_nonzero + 50 #+ 200 last_offmargin = len(regions_without_separators_0) - 170 #370 first_offmargin = 170 #370 + x = vertical_separators_0 y = regions_without_separators_0 # [first_nonzero:last_nonzero] - y_help = np.zeros(len(y) + 20) - y_help[10 : len(y) + 10] = y - x = np.arange(len(y)) - zneg_rev = -y_help + np.max(y_help) - zneg = np.zeros(len(zneg_rev) + 20) - zneg[10 : len(zneg_rev) + 10] = zneg_rev + y_help = np.pad(y, (10, 10), constant_values=(0, 0)) + zneg_rev = y.max() - y_help + zneg = np.pad(zneg_rev, (10, 10), constant_values=(0, 0)) + x = gaussian_filter1d(x, sigma_) z = gaussian_filter1d(y, sigma_) zneg = gaussian_filter1d(zneg, sigma_) @@ -333,6 +335,7 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0 # extra criterion: fixed multiple of lowest gap height + # print("grenze", grenze, multiplier * (5 + np.min(interest_neg))) grenze = min(grenze, multiplier * (5 + np.min(interest_neg))) # print(interest_neg,'interest_neg') @@ -341,16 +344,22 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(dis_talaei,'dis_talaei') # print(peaks_neg,'peaks_neg') # fig, (ax1, ax2) = plt.subplots(2, sharex=True) - # ax1.imshow(regions_without_separators, aspect="auto") + # ax1.imshow(regions_without_separators + 5 * vertical_separators, aspect="auto") # ax2.plot(z, color='red', label='z') # ax2.plot(zneg[20:], color='blue', label='zneg') + # ax2.plot(x, color='green', label='vsep') # ax2.scatter(peaks_neg, z[peaks_neg], color='red') # ax2.scatter(peaks_neg, zneg[20:][peaks_neg], color='blue') - # ax2.axhline(min_peaks_pos, color='red', label="min_peaks_pos") - # ax2.axhline(grenze, color='blue', label="grenze") + # ax2.axhline(min_peaks_pos, color='red') + # ax2.axhline(grenze, color='blue') + # ax2.annotate("min_peaks_pos", xy=(0, min_peaks_pos), color='red') + # ax2.annotate("grenze", xy=(0, grenze), color='blue') # ax2.text(0, grenze, "grenze") + # ax2.legend() # plt.show() + # print("vsep", x[peaks_neg]) + interest_neg = interest_neg - x[peaks_neg] interest_neg_fin = interest_neg[(interest_neg < grenze)] peaks_neg_fin = peaks_neg[(interest_neg < grenze)] From 5a3de3b42db5d92e7743e49c43315d0e98e679cd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 18:14:24 +0100 Subject: [PATCH 42/91] column detection: improve, aided by vseps whenever possible - `find_number_of_columns_in_document`: retain vertical separators and pass to `find_num_col` for each vertical split - `return_boxes_of_images_by_order_of_reading_new`: reconstruct the vertical separators from the segmentation mask and the separator bboxes; pass it on to `find_num_col` everywhere - `return_boxes_of_images_by_order_of_reading_new`: no need to try-catch `find_num_col` anymore - `return_boxes_of_images_by_order_of_reading_new`: when a vertical split has too few columns, * do not raise but lower the threshold `multiplier` responsible for allowing gaps as column boundaries * do not pass the `num_col_classifier` (i.e. 
expected number of resulting columns) of the entire page to the iterative `find_num_col` for each existing column, but only the portion of that span --- src/eynollah/utils/__init__.py | 97 ++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 0f2dac3..43d5d75 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import List, Tuple from logging import getLogger import time import math @@ -1315,7 +1315,35 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot.append(last_point) return peaks_neg_tot -def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): +def find_number_of_columns_in_document( + region_pre_p: np.ndarray, + num_col_classifier: int, + tables: bool, + label_seps: int, + contours_h: List[np.ndarray] = None, + logger=None +) -> Tuple[int, List[int], np.ndarray, List[int], np.ndarray]: + """ + Extract vertical and horizontal separators, vertical splits and horizontal column boundaries on page. + + Arguments: + * region_pre_p: segmentation map of the page + * num_col_classifier: predicted (expected) number of columns of the page + * tables: whether tables may be present + * label_seps: segmentation map class label for separators + * contours_h: polygons of potential headings (serving as additional horizontal separators) + * logger + + Returns: a tuple of + * the actual number of columns found + * the x coordinates of the column boundaries + * an array of the separators (bounding boxes and types) + * the y coordinates of the page splits + * a mask of the separators + """ + if logger is None: + logger = getLogger(__package__) + separators_closeup = 1 * (region_pre_p == label_seps) separators_closeup[0:110] = 0 separators_closeup[-150:] = 0 @@ -1483,8 +1511,11 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, num_big_parts += 1 try: num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot], - num_col_classifier, tables, multiplier=7.0) - # print("big part %d:%d has %d columns" % (top, bot, num_col + 1), peaks_neg_fin) + num_col_classifier, tables, + vertical_separators=1 * (vertical[top: bot] > 0), + multiplier=7.0) + logger.debug("big part %d:%d has %d columns", top, bot, num_col + 1) + # print(peaks_neg_fin) except: num_col = 0 peaks_neg_fin = [] @@ -1522,7 +1553,8 @@ def return_boxes_of_images_by_order_of_reading_new( * matrix_of_seps: type and coordinates of horizontal and vertical separators, as well as headings * num_col_classifier: predicted number of columns for the entire page - * erosion_hurts: bool + * erosion_hurts: whether region masks have already been eroded + (and thus gaps can be expected to be wider) * tables: bool * right2left_readingorder: whether to invert the default left-to-right order @@ -1578,6 +1610,12 @@ def return_boxes_of_images_by_order_of_reading_new( height_tot, width_tot = regions_without_separators.shape big_part = 22 * height_tot // 100 # percent height _, ccomps, cstats, _ = cv2.connectedComponentsWithStats(regions_without_separators.astype(np.uint8)) + args_ver = matrix_of_seps_ch[:, 9] == 1 + mask_ver = np.zeros_like(regions_without_separators, dtype=bool) + for i in np.flatnonzero(args_ver): + mask_ver[matrix_of_seps_ch[i, 6]: matrix_of_seps_ch[i, 7], + matrix_of_seps_ch[i, 2]: matrix_of_seps_ch[i, 3]] = True + 
vertical_seps = 1 * ((regions_with_separators == 6) & mask_ver) for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) @@ -1589,16 +1627,13 @@ def return_boxes_of_images_by_order_of_reading_new( #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= # 0.1 * (np.abs(bot-top))): - try: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - # we do not expect to get all columns in small parts (headings etc.): - num_col_classifier if bot - top >= big_part else 1, - tables, multiplier=6. if erosion_hurts else 7., - unbalanced=True) - except: - peaks_neg_fin=[] - num_col = 0 + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + # we do not expect to get all columns in small parts (headings etc.): + num_col_classifier if bot - top >= big_part else 1, + tables, vertical_separators=vertical_seps[top: bot], + multiplier=6. if erosion_hurts else 7., + unbalanced=True) try: if ((len(peaks_neg_fin) + 1 < num_col_classifier or num_col_classifier == 6) and @@ -1606,12 +1641,18 @@ def return_boxes_of_images_by_order_of_reading_new( bot - top >= big_part): # found too few columns here #print('burda') + logger.debug("searching for more than %d columns in big part %d:%d", + len(peaks_neg_fin) + 1, top, bot) peaks_neg_fin_org = np.copy(peaks_neg_fin) #print("peaks_neg_fin_org", peaks_neg_fin_org) - if len(peaks_neg_fin)==0: + if len(peaks_neg_fin) == 0: num_col, peaks_neg_fin = find_num_col( regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=3., unbalanced=True) + num_col_classifier, tables, + vertical_separators=vertical_seps[top: bot], + # try to be less strict (lower threshold than above) + multiplier=7. if erosion_hurts else 8., + unbalanced=True) #print(peaks_neg_fin,'peaks_neg_fin') peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] @@ -1625,22 +1666,19 @@ def return_boxes_of_images_by_order_of_reading_new( # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) # plt.title("vertical projection (sum over y)") # plt.show() - try: - _, peaks_neg_fin1 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=7.) - except: - peaks_neg_fin1 = [] - try: - _, peaks_neg_fin2 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=5.) - except: - peaks_neg_fin2 = [] + # try to get more peaks with different multipliers + num_col_expected = round((right - left) / width_tot * num_col_classifier) + args = regions_without_separators[top:bot, left:right], num_col_expected, tables + kwargs = dict(vertical_separators=vertical_seps[top: bot, left:right]) + _, peaks_neg_fin1 = find_num_col(*args, **kwargs, multiplier=7.) + _, peaks_neg_fin2 = find_num_col(*args, **kwargs, multiplier=5.) 
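+                    # keep whichever multiplier yields more column boundaries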
if len(peaks_neg_fin1) >= len(peaks_neg_fin2): peaks_neg_fin = peaks_neg_fin1 else: peaks_neg_fin = peaks_neg_fin2 + # print(peaks_neg_fin) + logger.debug("found %d additional column boundaries in %d:%d", + len(peaks_neg_fin), left, right) # add offset to local result peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') @@ -1652,6 +1690,7 @@ def return_boxes_of_images_by_order_of_reading_new( #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + #print("found more peaks than at first glance", peaks_neg_fin_rev, peaks_neg_fin_org) peaks_neg_fin = peaks_neg_fin_rev else: peaks_neg_fin = peaks_neg_fin_org From adcea47bc05ccbdfa76c6059d5f66e4610e5ae41 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 18:23:59 +0100 Subject: [PATCH 43/91] return_boxes_of_images_by_order_of_reading_new: always erode when passing the text region mask, do not apply erosion only if there are more than 2 columns, but iff `not erosion_hurts` (consistent with `find_num_col`'s expectations and making it as easy to find the column gaps on 1 and 2-column pages as on multi-column pages) --- src/eynollah/eynollah.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 46a1704..47198cb 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2356,7 +2356,6 @@ class Eynollah: img_only_regions_with_sep = (prediction_regions_org_y == 1).astype(np.uint8) try: img_only_regions = cv2.erode(img_only_regions_with_sep[:,:], KERNEL, iterations=20) - _, _ = find_num_col(img_only_regions, num_col_classifier, self.tables, multiplier=6.0) img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1]*(1.2 if is_image_enhanced else 1))) prediction_regions_org = self.do_prediction(True, img, self.models["region"]) @@ -3138,7 +3137,7 @@ class Eynollah: #print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) - if num_col_classifier >= 3: + if not erosion_hurts: if np.abs(slope_deskew) < SLOPE_THRESHOLD: regions_without_separators = regions_without_separators.astype(np.uint8) regions_without_separators = cv2.erode(regions_without_separators[:, :], KERNEL, iterations=6) @@ -3289,21 +3288,16 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps) - - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - num_col_d, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_d, num_col_classifier, self.tables, label_seps) - - if num_col_classifier>=3: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: + if not erosion_hurts: regions_without_separators = regions_without_separators.astype(np.uint8) regions_without_separators = cv2.erode(regions_without_separators[:,:], KERNEL, iterations=6) - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + else: + num_col_d, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) + if not erosion_hurts: regions_without_separators_d = regions_without_separators_d.astype(np.uint8) regions_without_separators_d = cv2.erode(regions_without_separators_d[:,:], KERNEL, iterations=6) - else: - pass if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = 
return_boxes_of_images_by_order_of_reading_new(
@@ -4149,6 +4143,7 @@
         self.run_enhancement(self.light_version)

         self.logger.info(f"Image: {self.image.shape[1]}x{self.image.shape[0]}, "
+                         f"scale {self.scale_x:.1f}x{self.scale_y:.1f}, "
                          f"{self.dpi} DPI, {num_col_classifier} columns")
         if is_image_enhanced:
             self.logger.info("Enhancement applied")
@@ -4682,7 +4677,7 @@ class Eynollah:
             _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(
                 text_regions_p_d, num_col_classifier, self.tables, label_seps)

-        if num_col_classifier >= 3:
+        if not erosion_hurts:
             if np.abs(slope_deskew) < SLOPE_THRESHOLD:
                 regions_without_separators = regions_without_separators.astype(np.uint8)
                 regions_without_separators = cv2.erode(regions_without_separators[:, :], KERNEL, iterations=6)

From 56e73bf72f412e5fb235a1c525834130a8932880 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 28 Nov 2025 18:27:58 +0100
Subject: [PATCH 44/91] deskewing: add a 2nd stage for precision

after selecting the optimum angle on the original search range,
narrow the search down to its vicinity with half the range
(adding computational cost, but gaining precision)

---
 src/eynollah/utils/separate_lines.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py
index 22ef00d..7e415b5 100644
--- a/src/eynollah/utils/separate_lines.py
+++ b/src/eynollah/utils/separate_lines.py
@@ -1564,6 +1564,9 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100,
         angle2, var2 = get_smallest_skew(img_resized, sigma_des, angles2, map=map, logger=logger, plotter=plotter)
         if var2 > var:
             angle = angle2
+    # precision stage:
+    angles = np.linspace(angle - 2.5, angle + 2.5, n_tot_angles // 2)
+    angle, _ = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter)
     return angle

 def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map):

From 43a95842bd0e4e29337b227183a231a8cf288646 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Tue, 2 Dec 2025 16:35:32 +0100
Subject: [PATCH 45/91] writer: also ensure validity after scaling

---
 src/eynollah/eynollah.py |  8 ++--
 src/eynollah/writer.py   | 93 +++++++++++++++------------------------
 2 files changed, 39 insertions(+), 62 deletions(-)

diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py
index 47198cb..cceab31 100644
--- a/src/eynollah/eynollah.py
+++ b/src/eynollah/eynollah.py
@@ -1670,10 +1670,10 @@ class Eynollah:
         else:
             box = [0, 0, self.image.shape[1], self.image.shape[0]]
             cropped_page, page_coord = crop_image_inside_box(box, self.image)
-        cont_page.append(np.array([[page_coord[2], page_coord[0]],
-                                   [page_coord[3], page_coord[0]],
-                                   [page_coord[3], page_coord[1]],
-                                   [page_coord[2], page_coord[1]]]))
+        cont_page.append(np.array([[[page_coord[2], page_coord[0]]],
+                                   [[page_coord[3], page_coord[0]]],
+                                   [[page_coord[3], page_coord[1]]],
+                                   [[page_coord[2], page_coord[1]]]]))
         return cropped_page, page_coord, cont_page

     def early_page_for_num_of_column_classification(self,img_bin):
diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py
index f8aff62..2e9c895 100644
--- a/src/eynollah/writer.py
+++ b/src/eynollah/writer.py
@@ -3,10 +3,10 @@ from pathlib import Path
 import os.path
 import xml.etree.ElementTree as ET

-from .utils.xml import create_page_xml, xml_reading_order
-from .utils.counter import EynollahIdCounter
+import numpy as np
+from shapely import affinity, clip_by_rect

-from ocrd_utils import getLogger
+from
ocrd_utils import getLogger, points_from_polygon from ocrd_models.ocrd_page import ( BorderType, CoordsType, @@ -19,7 +19,10 @@ from ocrd_models.ocrd_page import ( SeparatorRegionType, to_xml ) -import numpy as np + +from .utils.xml import create_page_xml, xml_reading_order +from .utils.counter import EynollahIdCounter +from .utils.contour import contour2polygon, make_valid class EynollahXmlWriter: @@ -41,20 +44,14 @@ class EynollahXmlWriter: def image_filename_stem(self): return Path(Path(self.image_filename).name).stem - def calculate_page_coords(self, cont_page): - self.logger.debug('enter calculate_page_coords') - points_page_print = "" - for _, contour in enumerate(cont_page[0]): - if len(contour) == 2: - points_page_print += str(int((contour[0]) / self.scale_x)) - points_page_print += ',' - points_page_print += str(int((contour[1]) / self.scale_y)) - else: - points_page_print += str(int((contour[0][0]) / self.scale_x)) - points_page_print += ',' - points_page_print += str(int((contour[0][1] ) / self.scale_y)) - points_page_print = points_page_print + ' ' - return points_page_print[:-1] + def calculate_points(self, contour, offset=None): + self.logger.debug('enter calculate_points') + poly = contour2polygon(contour) + if offset is not None: + poly = affinity.translate(poly, *offset) + poly = affinity.scale(poly, xfact=1 / self.scale_x, yfact=1 / self.scale_y, origin=(0, 0)) + poly = make_valid(clip_by_rect(poly, 0, 0, self.width_org, self.height_org)) + return points_from_polygon(poly.exterior.coords[:-1]) def serialize_lines_in_region(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter, ocr_all_textlines_textregion): self.logger.debug('enter serialize_lines_in_region') @@ -67,20 +64,12 @@ class EynollahXmlWriter: text_region.add_TextLine(textline) text_region.set_orientation(-slopes[region_idx]) region_bboxes = all_box_coord[region_idx] - points_co = '' - for point in polygon_textline: - if len(point) != 2: - point = point[0] - point_x = point[0] + page_coord[2] - point_y = point[1] + page_coord[0] - # FIXME: or actually... not self.textline_light and not self.curved_line or np.abs(slopes[region_idx]) > 45? - if not self.textline_light and not (self.curved_line and np.abs(slopes[region_idx]) <= 45): - point_x += region_bboxes[2] - point_y += region_bboxes[0] - point_x = max(0, int(point_x / self.scale_x)) - point_y = max(0, int(point_y / self.scale_y)) - points_co += str(point_x) + ',' + str(point_y) + ' ' - coords.set_points(points_co[:-1]) + offset = [page_coord[2], page_coord[0]] + # FIXME: or actually... not self.textline_light and not self.curved_line or np.abs(slopes[region_idx]) > 45? 
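+        # textline coords are relative to the region box, unless textlines were
+        # detected on the full page image (textline_light or curved-line mode):
+        # in the former case, also shift by the region box offset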
+ if not self.textline_light and not (self.curved_line and np.abs(slopes[region_idx]) <= 45): + offset[0] += region_bboxes[2] + offset[1] += region_bboxes[0] + coords.set_points(self.calculate_points(polygon_textline, offset)) def write_pagexml(self, pcgts): self.logger.info("output filename: '%s'", self.output_filename) @@ -135,8 +124,13 @@ class EynollahXmlWriter: # create the file structure pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) page = pcgts.get_Page() - page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) + if len(cont_page): + page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_points(cont_page[0])))) + if skip_layout_reading_order: + offset = None + else: + offset = [page_coord[2], page_coord[0]] counter = EynollahIdCounter() if len(order_of_texts): _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) @@ -149,8 +143,7 @@ class EynollahXmlWriter: for mm, region_contour in enumerate(found_polygons_text_region): textregion = TextRegionType( id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord, - skip_layout_reading_order)) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)) ) if conf_contours_textregions: textregion.Coords.set_conf(conf_contours_textregions[mm]) @@ -166,7 +159,7 @@ class EynollahXmlWriter: for mm, region_contour in enumerate(found_polygons_text_region_h): textregion = TextRegionType( id=counter.next_region_id, type_='heading', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)) ) if conf_contours_textregions_h: textregion.Coords.set_conf(conf_contours_textregions_h[mm]) @@ -181,7 +174,7 @@ class EynollahXmlWriter: for mm, region_contour in enumerate(found_polygons_marginals_left): marginal = TextRegionType( id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)) ) page.add_TextRegion(marginal) if ocr_all_textlines_marginals_left: @@ -193,7 +186,7 @@ class EynollahXmlWriter: for mm, region_contour in enumerate(found_polygons_marginals_right): marginal = TextRegionType( id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)) ) page.add_TextRegion(marginal) if ocr_all_textlines_marginals_right: @@ -206,7 +199,7 @@ class EynollahXmlWriter: for mm, region_contour in enumerate(found_polygons_drop_capitals): dropcapital = TextRegionType( id=counter.next_region_id, type_='drop-capital', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)) ) page.add_TextRegion(dropcapital) all_box_coord_drop = [[0, 0, 0, 0]] @@ -221,33 +214,17 @@ class EynollahXmlWriter: for region_contour in found_polygons_text_region_img: page.add_ImageRegion( ImageRegionType(id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)))) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)))) for region_contour in polygons_seplines: page.add_SeparatorRegion( 
SeparatorRegionType(id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, [0, 0, 0, 0])))) + Coords=CoordsType(points=self.calculate_points(region_contour, None)))) for region_contour in found_polygons_tables: page.add_TableRegion( TableRegionType(id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)))) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)))) return pcgts - def calculate_polygon_coords(self, contour, page_coord, skip_layout_reading_order=False): - self.logger.debug('enter calculate_polygon_coords') - coords = '' - for point in contour: - if len(point) != 2: - point = point[0] - point_x = point[0] - point_y = point[1] - if not skip_layout_reading_order: - point_x += page_coord[2] - point_y += page_coord[0] - point_x = int(point_x / self.scale_x) - point_y = int(point_y / self.scale_y) - coords += str(point_x) + ',' + str(point_y) + ' ' - return coords[:-1] - From ad8f8167c2d5bdc5c59d50a6a6eaf920b5e72c51 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Dec 2025 00:58:26 +0100 Subject: [PATCH 46/91] separate_lines/_vertical: gen cv2-like contours (w/ ndim=3, as in all other places) --- src/eynollah/utils/separate_lines.py | 128 +++++++++++++-------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 7e415b5..830dd8d 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -403,14 +403,14 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): point_down_rot3=point_down_rot3-y_help point_down_rot4=point_down_rot4-y_help - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + [[int(x_max), int(point_up)]], + [[int(x_max), int(point_down)]], + [[int(x_min), int(point_down)]]])) elif len(peaks) < 1: pass @@ -462,14 +462,14 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): point_down_rot3=point_down_rot3-y_help point_down_rot4=point_down_rot4-y_help - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(y_min)], - [int(x_max), int(y_min)], - [int(x_max), int(y_max)], - [int(x_min), int(y_max)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(y_min)]], + [[int(x_max), int(y_min)]], + [[int(x_max), int(y_max)]], + [[int(x_min), int(y_max)]]])) elif len(peaks) == 2: dis_to_next = np.abs(peaks[1] - peaks[0]) for jj in range(len(peaks)): @@ -530,14 +530,14 @@ def 
separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): point_down_rot3=point_down_rot3-y_help point_down_rot4=point_down_rot4-y_help - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + [[int(x_max), int(point_up)]], + [[int(x_max), int(point_down)]], + [[int(x_min), int(point_down)]]])) else: for jj in range(len(peaks)): if jj == 0: @@ -606,14 +606,14 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): point_down_rot3=point_down_rot3-y_help point_down_rot4=point_down_rot4-y_help - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + [[int(x_max), int(point_up)]], + [[int(x_max), int(point_down)]], + [[int(x_min), int(point_down)]]])) return peaks, textline_boxes_rot def separate_lines_vertical(img_patch, contour_text_interest, thetha): @@ -785,14 +785,14 @@ def separate_lines_vertical(img_patch, contour_text_interest, thetha): if point_up_rot2 < 0: point_up_rot2 = 0 - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + [[int(x_max), int(point_up)]], + [[int(x_max), int(point_down)]], + [[int(x_min), int(point_down)]]])) elif len(peaks) < 1: pass elif len(peaks) == 1: @@ -821,14 +821,14 @@ def separate_lines_vertical(img_patch, contour_text_interest, thetha): if point_up_rot2 < 0: point_up_rot2 = 0 - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(y_min)], - [int(x_max), int(y_min)], - [int(x_max), int(y_max)], - [int(x_min), int(y_max)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), 
int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(y_min)]], + [[int(x_max), int(y_min)]], + [[int(x_max), int(y_max)]], + [[int(x_min), int(y_max)]]])) elif len(peaks) == 2: dis_to_next = np.abs(peaks[1] - peaks[0]) for jj in range(len(peaks)): @@ -876,14 +876,14 @@ def separate_lines_vertical(img_patch, contour_text_interest, thetha): if point_up_rot2 < 0: point_up_rot2 = 0 - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + [[int(x_max), int(point_up)]], + [[int(x_max), int(point_down)]], + [[int(x_min), int(point_down)]]])) else: for jj in range(len(peaks)): if jj == 0: @@ -942,14 +942,14 @@ def separate_lines_vertical(img_patch, contour_text_interest, thetha): if point_up_rot2 < 0: point_up_rot2 = 0 - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + [[int(x_max), int(point_up)]], + [[int(x_max), int(point_down)]], + [[int(x_min), int(point_down)]]])) return peaks, textline_boxes_rot def separate_lines_new_inside_tiles2(img_patch, thetha): From 9fdae72e9620bd0ebd3bcef6bd8189fe8a003734 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Dec 2025 03:04:46 +0100 Subject: [PATCH 47/91] utils_ocr.return_textline_contour: gen cv2-like contours (w/ ndim=3, as in all other places) --- src/eynollah/utils/utils_ocr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 6e71b0f..fbe3611 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -369,8 +369,8 @@ def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, return img_curved, img_bin_curved def return_textline_contour_with_added_box_coordinate(textline_contour, box_ind): - textline_contour[:,0] = textline_contour[:,0] + box_ind[2] - textline_contour[:,1] = textline_contour[:,1] + box_ind[0] + textline_contour[:,:,0] += box_ind[2] + textline_contour[:,:,1] += box_ind[0] return textline_contour From e2754da4f5f81ce34d5a21bf726741c27ac2aecf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Jan 2026 04:04:07 +0100 Subject: [PATCH 48/91] =?UTF-8?q?adapt=20to=20Numpy=201.25=20changes?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (esp. 
`np.array(...)` now not allowed on ragged arrays unless `dtype=object`, but then coercing sub-arrays to `object` as well) --- src/eynollah/eynollah.py | 22 +++++++++++++--------- src/eynollah/utils/__init__.py | 10 +++++++++- src/eynollah/utils/contour.py | 13 ++++++++----- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index cceab31..c33b9f8 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -117,6 +117,7 @@ from .utils.marginals import get_marginals from .utils.resize import resize_image from .utils.shm import share_ndarray from .utils import ( + ensure_array, is_image_filename, boosting_headers_by_longshot_region_segmentation, crop_image_inside_box, @@ -2475,8 +2476,8 @@ class Eynollah: self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): self.logger.debug("enter do_order_of_regions") - contours_only_text_parent = np.array(contours_only_text_parent) - contours_only_text_parent_h = np.array(contours_only_text_parent_h) + contours_only_text_parent = ensure_array(contours_only_text_parent) + contours_only_text_parent_h = ensure_array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1), 0.5 * boxes[:, 0:2].sum(axis=1))) @@ -3987,7 +3988,7 @@ class Eynollah: def filterfun(lis): if len(lis) == 0: return [] - return list(np.array(lis)[indices]) + return list(ensure_array(lis)[indices]) return (filterfun(contours_par), filterfun(contours_textline), @@ -4378,7 +4379,8 @@ class Eynollah: areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) areas_cnt_text = areas_cnt_text / float(areas_tot_text) #self.logger.info('areas_cnt_text %s', areas_cnt_text) - contours_only_text_parent = np.array(contours_only_text_parent)[areas_cnt_text > MIN_AREA_REGION] + contours_only_text_parent = ensure_array(contours_only_text_parent) + contours_only_text_parent = contours_only_text_parent[areas_cnt_text > MIN_AREA_REGION] areas_cnt_text_parent = areas_cnt_text[areas_cnt_text > MIN_AREA_REGION] index_con_parents = np.argsort(areas_cnt_text_parent) @@ -4397,12 +4399,13 @@ class Eynollah: areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) areas_cnt_text_d = areas_cnt_text_d / float(areas_tot_text_d) - contours_only_text_parent_d = np.array(contours_only_text_parent_d)[areas_cnt_text_d > MIN_AREA_REGION] + contours_only_text_parent_d = ensure_array(contours_only_text_parent_d) + contours_only_text_parent_d = contours_only_text_parent_d[areas_cnt_text_d > MIN_AREA_REGION] areas_cnt_text_d = areas_cnt_text_d[areas_cnt_text_d > MIN_AREA_REGION] if len(contours_only_text_parent_d): index_con_parents_d = np.argsort(areas_cnt_text_d) - contours_only_text_parent_d = np.array(contours_only_text_parent_d)[index_con_parents_d] + contours_only_text_parent_d = contours_only_text_parent_d[index_con_parents_d] areas_cnt_text_d = areas_cnt_text_d[index_con_parents_d] centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d)) # [2, N] @@ -4546,9 +4549,10 @@ class Eynollah: #print("text region early 3 in %.1fs", time.time() - t0) if self.light_version: contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent) - contours_only_text_parent , contours_only_text_parent_d_ordered = self.filter_contours_inside_a_bigger_one( - contours_only_text_parent, contours_only_text_parent_d_ordered, text_only, - 
marginal_cnts=polygons_of_marginals) + contours_only_text_parent, contours_only_text_parent_d_ordered = \ + self.filter_contours_inside_a_bigger_one( + contours_only_text_parent, contours_only_text_parent_d_ordered, text_only, + marginal_cnts=polygons_of_marginals) #print("text region early 3.5 in %.1fs", time.time() - t0) conf_contours_textregions = get_textregion_contours_in_org_image_light( contours_only_text_parent, self.image, confidence_matrix) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 43d5d75..4e55aef 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing import Iterable, List, Tuple from logging import getLogger import time import math @@ -1929,3 +1929,11 @@ def is_image_filename(fname: str) -> bool: def is_xml_filename(fname: str) -> bool: return fname.lower().endswith('.xml') + +def ensure_array(obj: Iterable) -> np.ndarray: + """convert sequence to array of type `object` so items can be of heterogeneous shape + (but ensure not to convert inner arrays to `object` if len=1) + """ + if not isinstance(obj, np.ndarray): + return np.fromiter(obj, object) + return obj diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 393acdd..7d01e74 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -12,6 +12,7 @@ from shapely import set_precision from shapely.ops import unary_union, nearest_points from .rotate import rotate_image, rotation_image_new +from . import ensure_array def contours_in_same_horizon(cy_main_hor): """ @@ -248,13 +249,15 @@ def return_contours_of_image(image): return contours, hierarchy def dilate_textline_contours(all_found_textline_polygons): - return [[polygon2contour(contour2polygon(contour, dilate=6)) - for contour in region] + return [ensure_array( + [polygon2contour(contour2polygon(contour, dilate=6)) + for contour in region]) for region in all_found_textline_polygons] -def dilate_textregion_contours(all_found_textline_polygons): - return [polygon2contour(contour2polygon(contour, dilate=6)) - for contour in all_found_textline_polygons] +def dilate_textregion_contours(all_found_textregion_polygons): + return ensure_array( + [polygon2contour(contour2polygon(contour, dilate=6)) + for contour in all_found_textregion_polygons]) def contour2polygon(contour: Union[np.ndarray, Sequence[Sequence[Sequence[Number]]]], dilate=0): polygon = Polygon([point[0] for point in contour]) From 3c3effcfda9b8d4dfd9dc8f685bb520fab1840b3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Jan 2026 04:18:55 +0100 Subject: [PATCH 49/91] =?UTF-8?q?drop=20TF1=20vernacular,=20relax=20TF/Ker?= =?UTF-8?q?as=20and=20Torch=20requirements=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - do not restrict TF version, but depend on tf-keras and set `TF_USE_LEGACY_KERAS=1` to avoid Keras 3 behaviour - relax Numpy version requirement up to v2 - relax Torch version requirement - drop TF1 session management code - drop TF1 config in favour of TF2 config code for memory growth - training.*: also simplify and limit line length - training.train: always train with TensorBoard callback --- requirements-ocr.txt | 2 +- requirements.txt | 5 +- src/eynollah/eynollah.py | 12 +- src/eynollah/sbb_binarize.py | 28 +- ..._model_load_pretrained_weights_and_save.py | 8 +- src/eynollah/training/inference.py | 192 ++++------ src/eynollah/training/train.py | 333 +++++++++++------- 
src/eynollah/utils/contour.py | 3 +- 8 files changed, 289 insertions(+), 294 deletions(-) diff --git a/requirements-ocr.txt b/requirements-ocr.txt index 9f31ebb..8f3b062 100644 --- a/requirements-ocr.txt +++ b/requirements-ocr.txt @@ -1,2 +1,2 @@ -torch <= 2.0.1 +torch transformers <= 4.30.2 diff --git a/requirements.txt b/requirements.txt index db1d7df..5699566 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ # ocrd includes opencv, numpy, shapely, click ocrd >= 3.3.0 -numpy <1.24.0 +numpy < 2.0 scikit-learn >= 0.23.2 -tensorflow < 2.13 +tensorflow +tf-keras # avoid keras 3 (also needs TF_USE_LEGACY_KERAS=1) numba <= 0.58.1 scikit-image biopython diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index c33b9f8..4a83c0a 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -56,14 +56,12 @@ except ImportError: TrOCRProcessor = VisionEncoderDecoderModel = None #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 tf_disable_interactive_logs() import tensorflow as tf -from tensorflow.python.keras import backend as K from tensorflow.keras.models import load_model tf.get_logger().setLevel("ERROR") warnings.filterwarnings("ignore") -# use tf1 compatibility for keras backend -from tensorflow.compat.v1.keras.backend import set_session from tensorflow.keras import layers from tensorflow.keras.layers import StringLookup @@ -277,14 +275,6 @@ class Eynollah: t_start = time.time() - # #gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) - # #gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True) - # #session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) - # config = tf.compat.v1.ConfigProto() - # config.gpu_options.allow_growth = True - # #session = tf.InteractiveSession() - # session = tf.compat.v1.Session(config=config) - # set_session(session) try: for device in tf.config.list_physical_devices('GPU'): tf.config.experimental.set_memory_growth(device, True) diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index b81f45e..2ca4a40 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -2,19 +2,19 @@ Tool to load model and binarize a given image. 
""" -import sys from glob import glob import os import logging +from PIL import Image import numpy as np -from PIL import Image import cv2 from ocrd_utils import tf_disable_interactive_logs + +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 tf_disable_interactive_logs() import tensorflow as tf from tensorflow.keras.models import load_model -from tensorflow.python.keras import backend as tensorflow_backend from .utils import is_image_filename @@ -27,26 +27,17 @@ class SbbBinarizer: self.model_dir = model_dir self.logger = logger if logger else logging.getLogger('SbbBinarizer') - self.start_new_session() - - self.model_files = glob(self.model_dir+"/*/", recursive = True) + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + self.logger.warning("no GPU device available") + self.model_files = glob(self.model_dir + "/*/", recursive=True) self.models = [] for model_file in self.model_files: self.models.append(self.load_model(model_file)) - def start_new_session(self): - config = tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True - - self.session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() - tensorflow_backend.set_session(self.session) - - def end_session(self): - tensorflow_backend.clear_session() - self.session.close() - del self.session - def load_model(self, model_name): model = load_model(os.path.join(self.model_dir, model_name), compile=False) model_height = model.layers[len(model.layers)-1].output_shape[1] @@ -55,7 +46,6 @@ class SbbBinarizer: return model, model_height, model_width, n_classes def predict(self, model_in, img, use_patches, n_batch_inference=5): - tensorflow_backend.set_session(self.session) model, model_height, model_width, n_classes = model_in img_org_h = img.shape[0] diff --git a/src/eynollah/training/build_model_load_pretrained_weights_and_save.py b/src/eynollah/training/build_model_load_pretrained_weights_and_save.py index 40fc1fe..9fba66b 100644 --- a/src/eynollah/training/build_model_load_pretrained_weights_and_save.py +++ b/src/eynollah/training/build_model_load_pretrained_weights_and_save.py @@ -1,3 +1,4 @@ +import sys import click import tensorflow as tf @@ -5,8 +6,11 @@ from .models import resnet50_unet def configuration(): - gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) - session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + print("no GPU device available", file=sys.stderr) @click.command() def build_model_load_pretrained_weights_and_save(): diff --git a/src/eynollah/training/inference.py b/src/eynollah/training/inference.py index 3fa8fd6..15d1e6a 100644 --- a/src/eynollah/training/inference.py +++ b/src/eynollah/training/inference.py @@ -1,16 +1,19 @@ +""" +Tool to load model and predict for given image. 
+""" + import sys import os import warnings import json +import click import numpy as np import cv2 -from tensorflow.keras.models import load_model + +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf -from tensorflow.keras import backend as K -from tensorflow.keras.layers import * -import click -from tensorflow.python.keras import backend as tensorflow_backend +from tensorflow.keras.models import load_model import xml.etree.ElementTree as ET from .gt_gen_utils import ( @@ -24,17 +27,29 @@ from .models import ( PatchEncoder, Patches ) +from .metrics import ( + soft_dice_loss, + weighted_categorical_crossentropy, +) with warnings.catch_warnings(): warnings.simplefilter("ignore") -__doc__=\ -""" -Tool to load model and predict for given image. -""" +class SBBPredict: + def __init__(self, + image, + dir_in, + model, + task, + config_params_model, + patches, + save, + save_layout, + ground_truth, + xml_file, + out, + min_area): -class sbb_predict: - def __init__(self,image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, out, min_area): self.image=image self.dir_in=dir_in self.patches=patches @@ -52,8 +67,9 @@ class sbb_predict: self.min_area = 0 def resize_image(self,img_in,input_height,input_width): - return cv2.resize( img_in, ( input_width,input_height) ,interpolation=cv2.INTER_NEAREST) - + return cv2.resize(img_in, (input_width, + input_height), + interpolation=cv2.INTER_NEAREST) def color_images(self,seg): ann_u=range(self.n_classes) @@ -69,68 +85,6 @@ class sbb_predict: seg_img[:,:,2][seg==c]=c return seg_img - def otsu_copy_binary(self,img): - img_r=np.zeros((img.shape[0],img.shape[1],3)) - img1=img[:,:,0] - - #print(img.min()) - #print(img[:,:,0].min()) - #blur = cv2.GaussianBlur(img,(5,5)) - #ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - retval1, threshold1 = cv2.threshold(img1, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) - - - - img_r[:,:,0]=threshold1 - img_r[:,:,1]=threshold1 - img_r[:,:,2]=threshold1 - #img_r=img_r/float(np.max(img_r))*255 - return img_r - - def otsu_copy(self,img): - img_r=np.zeros((img.shape[0],img.shape[1],3)) - #img1=img[:,:,0] - - #print(img.min()) - #print(img[:,:,0].min()) - #blur = cv2.GaussianBlur(img,(5,5)) - #ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - _, threshold1 = cv2.threshold(img[:,:,0], 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) - _, threshold2 = cv2.threshold(img[:,:,1], 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) - _, threshold3 = cv2.threshold(img[:,:,2], 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) - - - - img_r[:,:,0]=threshold1 - img_r[:,:,1]=threshold2 - img_r[:,:,2]=threshold3 - ###img_r=img_r/float(np.max(img_r))*255 - return img_r - - def soft_dice_loss(self,y_true, y_pred, epsilon=1e-6): - - axes = tuple(range(1, len(y_pred.shape)-1)) - - numerator = 2. 
* K.sum(y_pred * y_true, axes) - - denominator = K.sum(K.square(y_pred) + K.square(y_true), axes) - return 1.00 - K.mean(numerator / (denominator + epsilon)) # average over classes and batch - - def weighted_categorical_crossentropy(self,weights=None): - - def loss(y_true, y_pred): - labels_floats = tf.cast(y_true, tf.float32) - per_pixel_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_floats,logits=y_pred) - - if weights is not None: - weight_mask = tf.maximum(tf.reduce_max(tf.constant( - np.array(weights, dtype=np.float32)[None, None, None]) - * labels_floats, axis=-1), 1.0) - per_pixel_loss = per_pixel_loss * weight_mask[:, :, :, None] - return tf.reduce_mean(per_pixel_loss) - return self.loss - - def IoU(self,Yi,y_predi): ## mean Intersection over Union ## Mean IoU = TP/(FN + TP + FP) @@ -157,30 +111,28 @@ class sbb_predict: return mIoU def start_new_session_and_model(self): - - config = tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + print("no GPU device available", file=sys.stderr) - session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() - tensorflow_backend.set_session(session) #tensorflow.keras.layers.custom_layer = PatchEncoder #tensorflow.keras.layers.custom_layer = Patches - self.model = load_model(self.model_dir , compile=False,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) - #config = tf.ConfigProto() - #config.gpu_options.allow_growth=True - - #self.session = tf.InteractiveSession() - #keras.losses.custom_loss = self.weighted_categorical_crossentropy - #self.model = load_model(self.model_dir , compile=False) + self.model = load_model(self.model_dir, compile=False, + custom_objects={"PatchEncoder": PatchEncoder, + "Patches": Patches}) + #keras.losses.custom_loss = weighted_categorical_crossentropy + #self.model = load_model(self.model_dir, compile=False) - ##if self.weights_dir!=None: ##self.model.load_weights(self.weights_dir) if self.task != 'classification' and self.task != 'reading_order': - self.img_height=self.model.layers[len(self.model.layers)-1].output_shape[1] - self.img_width=self.model.layers[len(self.model.layers)-1].output_shape[2] - self.n_classes=self.model.layers[len(self.model.layers)-1].output_shape[3] + last = self.model.layers[-1] + self.img_height = last.output_shape[1] + self.img_width = last.output_shape[2] + self.n_classes = last.output_shape[3] def visualize_model_output(self, prediction, img, task): if task == "binarization": @@ -208,21 +160,16 @@ class sbb_predict: '15' : [255, 0, 255]} layout_only = np.zeros(prediction.shape) - for unq_class in unique_classes: + where = prediction[:,:,0]==unq_class rgb_class_unique = rgb_colors[str(int(unq_class))] - layout_only[:,:,0][prediction[:,:,0]==unq_class] = rgb_class_unique[0] - layout_only[:,:,1][prediction[:,:,0]==unq_class] = rgb_class_unique[1] - layout_only[:,:,2][prediction[:,:,0]==unq_class] = rgb_class_unique[2] - - + layout_only[:,:,0][where] = rgb_class_unique[0] + layout_only[:,:,1][where] = rgb_class_unique[1] + layout_only[:,:,2][where] = rgb_class_unique[2] + layout_only = layout_only.astype(np.int32) img = self.resize_image(img, layout_only.shape[0], layout_only.shape[1]) - - layout_only = layout_only.astype(np.int32) img = img.astype(np.int32) - - added_image = cv2.addWeighted(img,0.5,layout_only,0.1,0) @@ -231,10 +178,10 @@ class sbb_predict: def predict(self, image_dir): if self.task == 
'classification': classes_names = self.config_params_model['classification_classes_name'] - img_1ch = img=cv2.imread(image_dir, 0) - - img_1ch = img_1ch / 255.0 - img_1ch = cv2.resize(img_1ch, (self.config_params_model['input_height'], self.config_params_model['input_width']), interpolation=cv2.INTER_NEAREST) + img_1ch = cv2.imread(image_dir, 0) / 255.0 + img_1ch = cv2.resize(img_1ch, (self.config_params_model['input_height'], + self.config_params_model['input_width']), + interpolation=cv2.INTER_NEAREST) img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) img_in[0, :, :, 0] = img_1ch[:, :] img_in[0, :, :, 1] = img_1ch[:, :] @@ -244,23 +191,27 @@ class sbb_predict: index_class = np.argmax(label_p_pred[0]) print("Predicted Class: {}".format(classes_names[str(int(index_class))])) + elif self.task == 'reading_order': img_height = self.config_params_model['input_height'] img_width = self.config_params_model['input_width'] - tree_xml, root_xml, bb_coord_printspace, file_name, id_paragraph, id_header, co_text_paragraph, co_text_header, tot_region_ref, x_len, y_len, index_tot_regions, img_poly = read_xml(self.xml_file) - _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours(co_text_header) + tree_xml, root_xml, bb_coord_printspace, file_name, \ + id_paragraph, id_header, \ + co_text_paragraph, co_text_header, \ + tot_region_ref, x_len, y_len, index_tot_regions, \ + img_poly = read_xml(self.xml_file) + _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = \ + find_new_features_of_contours(co_text_header) img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') - - for j in range(len(cy_main)): - img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12,int(x_min_main[j]):int(x_max_main[j]) ] = 1 - + img_header_and_sep[int(y_max_main[j]): int(y_max_main[j]) + 12, + int(x_min_main[j]): int(x_max_main[j])] = 1 + co_text_all = co_text_paragraph + co_text_header id_all_text = id_paragraph + id_header - ##texts_corr_order_index = [index_tot_regions[tot_region_ref.index(i)] for i in id_all_text ] ##texts_corr_order_index_int = [int(x) for x in texts_corr_order_index] texts_corr_order_index_int = list(np.array(range(len(co_text_all)))) @@ -271,7 +222,8 @@ class sbb_predict: #print(np.shape(co_text_all[0]), len( np.shape(co_text_all[0]) ),'co_text_all') #co_text_all = filter_contours_area_of_image_tables(img_poly, co_text_all, _, max_area, min_area) #print(co_text_all,'co_text_all') - co_text_all, texts_corr_order_index_int, _ = filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, max_area, self.min_area) + co_text_all, texts_corr_order_index_int, _ = filter_contours_area_of_image( + img_poly, co_text_all, texts_corr_order_index_int, max_area, self.min_area) #print(texts_corr_order_index_int) @@ -664,17 +616,15 @@ class sbb_predict: help="min area size of regions considered for reading order detection. 
The default value is zero and means that all text regions are considered for reading order.", ) def main(image, dir_in, model, patches, save, save_layout, ground_truth, xml_file, out, min_area): - assert image or dir_in, "Either a single image -i or a dir_in -di is required" + assert image or dir_in, "Either a single image -i or a dir_in -di input is required" with open(os.path.join(model,'config.json')) as f: config_params_model = json.load(f) task = config_params_model['task'] if task != 'classification' and task != 'reading_order': - if image and not save: - print("Error: You used one of segmentation or binarization task with image input but not set -s, you need a filename to save visualized output with -s") - sys.exit(1) - if dir_in and not out: - print("Error: You used one of segmentation or binarization task with dir_in but not set -out") - sys.exit(1) - x=sbb_predict(image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, out, min_area) + assert not image or save, "For segmentation or binarization, an input single image -i also requires an output filename -s" + assert not dir_in or out, "For segmentation or binarization, an input directory -di also requires an output directory -o" + x = SBBPredict(image, dir_in, model, task, config_params_model, + patches, save, save_layout, ground_truth, xml_file, out, + min_area) x.run() diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 97736e0..da901b0 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -28,14 +28,14 @@ from eynollah.training.utils import ( ) os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf -from tensorflow.compat.v1.keras.backend import set_session from tensorflow.keras.optimizers import SGD, Adam -from sacred import Experiment from tensorflow.keras.models import load_model +from tensorflow.keras.callbacks import Callback, TensorBoard +from sacred import Experiment from tqdm import tqdm from sklearn.metrics import f1_score -from tensorflow.keras.callbacks import Callback import numpy as np import cv2 @@ -63,10 +63,11 @@ class SaveWeightsAfterSteps(Callback): def configuration(): - config = tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True - session = tf.compat.v1.Session(config=config) - set_session(session) + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + print("no GPU device available", file=sys.stderr) def get_dirs_or_files(input_data): @@ -171,12 +172,11 @@ def run(_config, n_classes, n_epochs, input_height, else: list_all_possible_foreground_rgbs = None - if task == "segmentation" or task == "enhancement" or task == "binarization": + if task in ["segmentation", "enhancement", "binarization"]: if data_is_provided: dir_train_flowing = os.path.join(dir_output, 'train') dir_eval_flowing = os.path.join(dir_output, 'eval') - dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images') dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels') @@ -227,176 +227,228 @@ def run(_config, n_classes, n_epochs, input_height, segs_list_test=np.array(os.listdir(dir_seg_val)) # writing patches into a sub-folder in order to be flowed from directory. 
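# Editor's note (hedged): the hunk below deduplicates the two provide_patches()
# calls by bundling every shared parameter into common_args/common_kwargs, so the
# train and eval invocations can no longer drift apart. A minimal illustration of
# the pattern (emit_patches is a hypothetical stand-in, not an eynollah function):
def emit_patches(src, dst, height, width, augmentation=False, **kwargs):
    print(src, dst, height, width, augmentation, kwargs)

common_args = [448, 672]                              # e.g. input_height, input_width
common_kwargs = dict(patches=True, dir_img_bin=None)  # shared keyword arguments
emit_patches("train/images", "flow/train", *common_args, augmentation=True, **common_kwargs)
emit_patches("eval/images", "flow/eval", *common_args, augmentation=False, **common_kwargs)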
- provide_patches(imgs_list, segs_list, dir_img, dir_seg, dir_flow_train_imgs, - dir_flow_train_labels, input_height, input_width, blur_k, - blur_aug, padding_white, padding_black, flip_aug, binarization, adding_rgb_background,adding_rgb_foreground, add_red_textlines, channels_shuffling, - scaling, shifting, degrading, brightening, scales, degrade_scales, brightness, - flip_index,shuffle_indexes, scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=augmentation, - patches=patches, dir_img_bin=dir_img_bin,number_of_backgrounds_per_image=number_of_backgrounds_per_image,list_all_possible_background_images=list_all_possible_background_images, dir_rgb_backgrounds=dir_rgb_backgrounds, dir_rgb_foregrounds=dir_rgb_foregrounds,list_all_possible_foreground_rgbs=list_all_possible_foreground_rgbs) - - provide_patches(imgs_list_test, segs_list_test, dir_img_val, dir_seg_val, - dir_flow_eval_imgs, dir_flow_eval_labels, input_height, input_width, - blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, adding_rgb_background, adding_rgb_foreground, add_red_textlines, channels_shuffling, - scaling, shifting, degrading, brightening, scales, degrade_scales, brightness, - flip_index, shuffle_indexes, scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=patches,dir_img_bin=dir_img_bin,number_of_backgrounds_per_image=number_of_backgrounds_per_image,list_all_possible_background_images=list_all_possible_background_images, dir_rgb_backgrounds=dir_rgb_backgrounds,dir_rgb_foregrounds=dir_rgb_foregrounds,list_all_possible_foreground_rgbs=list_all_possible_foreground_rgbs ) + common_args = [input_height, input_width, + blur_k, blur_aug, + padding_white, padding_black, + flip_aug, binarization, + adding_rgb_background, + adding_rgb_foreground, + add_red_textlines, + channels_shuffling, + scaling, shifting, degrading, brightening, + scales, degrade_scales, brightness, + flip_index, shuffle_indexes, + scaling_bluring, scaling_brightness, scaling_binarization, + rotation, rotation_not_90, thetha, + scaling_flip, task, + ] + common_kwargs = dict(patches= + patches, + dir_img_bin= + dir_img_bin, + number_of_backgrounds_per_image= + number_of_backgrounds_per_image, + list_all_possible_background_images= + list_all_possible_background_images, + dir_rgb_backgrounds= + dir_rgb_backgrounds, + dir_rgb_foregrounds= + dir_rgb_foregrounds, + list_all_possible_foreground_rgbs= + list_all_possible_foreground_rgbs, + ) + provide_patches(imgs_list, segs_list, + dir_img, dir_seg, + dir_flow_train_imgs, + dir_flow_train_labels, + *common_args, + augmentation=augmentation, + **common_kwargs) + provide_patches(imgs_list_test, segs_list_test, + dir_img_val, dir_seg_val, + dir_flow_eval_imgs, + dir_flow_eval_labels, + *common_args, + augmentation=False, + **common_kwargs) if weighted_loss: weights = np.zeros(n_classes) if data_is_provided: - for obj in os.listdir(dir_flow_train_labels): - try: - label_obj = cv2.imread(dir_flow_train_labels + '/' + obj) - label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) - weights += (label_obj_one_hot.sum(axis=0)).sum(axis=0) - except: - pass + dirs = dir_flow_train_labels else: - - for obj in os.listdir(dir_seg): - try: - label_obj = cv2.imread(dir_seg + '/' + obj) - label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) - weights += 
(label_obj_one_hot.sum(axis=0)).sum(axis=0) - except: - pass + dirs = dir_seg + for obj in os.listdir(dirs): + label_file = os.path.join(dirs, + obj) + try: + label_obj = cv2.imread(label_file) + label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) + weights += (label_obj_one_hot.sum(axis=0)).sum(axis=0) + except Exception as e: + print("error reading data file '%s': %s" % (label_file, e), file=sys.stderr) weights = 1.00 / weights - weights = weights / float(np.sum(weights)) weights = weights / float(np.min(weights)) weights = weights / float(np.sum(weights)) if continue_training: - if backbone_type=='nontransformer': - if is_loss_soft_dice and (task == "segmentation" or task == "binarization"): - model = load_model(dir_of_start_model, compile=True, custom_objects={'soft_dice_loss': soft_dice_loss}) - if weighted_loss and (task == "segmentation" or task == "binarization"): - model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) - if not is_loss_soft_dice and not weighted_loss: + if backbone_type == 'nontransformer': + if is_loss_soft_dice and task in ["segmentation", "binarization"]: + model = load_model(dir_of_start_model, compile=True, + custom_objects={'soft_dice_loss': soft_dice_loss}) + elif weighted_loss and task in ["segmentation", "binarization"]: + model = load_model(dir_of_start_model, compile=True, + custom_objects={'loss': weighted_categorical_crossentropy(weights)}) + else: model = load_model(dir_of_start_model , compile=True) - elif backbone_type=='transformer': - if is_loss_soft_dice and (task == "segmentation" or task == "binarization"): - model = load_model(dir_of_start_model, compile=True, custom_objects={"PatchEncoder": PatchEncoder, "Patches": Patches,'soft_dice_loss': soft_dice_loss}) - if weighted_loss and (task == "segmentation" or task == "binarization"): - model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) - if not is_loss_soft_dice and not weighted_loss: - model = load_model(dir_of_start_model , compile=True,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) + + elif backbone_type == 'transformer': + if is_loss_soft_dice and task in ["segmentation", "binarization"]: + model = load_model(dir_of_start_model, compile=True, + custom_objects={"PatchEncoder": PatchEncoder, + "Patches": Patches, + 'soft_dice_loss': soft_dice_loss}) + elif weighted_loss and task in ["segmentation", "binarization"]: + model = load_model(dir_of_start_model, compile=True, + custom_objects={'loss': weighted_categorical_crossentropy(weights)}) + else: + model = load_model(dir_of_start_model, compile=True, + custom_objects = {"PatchEncoder": PatchEncoder, + "Patches": Patches}) else: index_start = 0 - if backbone_type=='nontransformer': - model = resnet50_unet(n_classes, input_height, input_width, task, weight_decay, pretraining) - elif backbone_type=='transformer': + if backbone_type == 'nontransformer': + model = resnet50_unet(n_classes, + input_height, + input_width, + task, + weight_decay, + pretraining) + elif backbone_type == 'transformer': num_patches_x = transformer_num_patches_xy[0] num_patches_y = transformer_num_patches_xy[1] num_patches = num_patches_x * num_patches_y if transformer_cnn_first: - if input_height != (num_patches_y * transformer_patchsize_y * 32): - print("Error: transformer_patchsize_y or transformer_num_patches_xy height value error . 
input_height should be equal to ( transformer_num_patches_xy height value * transformer_patchsize_y * 32)") - sys.exit(1) - if input_width != (num_patches_x * transformer_patchsize_x * 32): - print("Error: transformer_patchsize_x or transformer_num_patches_xy width value error . input_width should be equal to ( transformer_num_patches_xy width value * transformer_patchsize_x * 32)") - sys.exit(1) - if (transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x)) != 0: - print("Error: transformer_projection_dim error. The remainder when parameter transformer_projection_dim is divided by (transformer_patchsize_y*transformer_patchsize_x) should be zero") - sys.exit(1) - - - model = vit_resnet50_unet(n_classes, transformer_patchsize_x, transformer_patchsize_y, num_patches, transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_projection_dim, input_height, input_width, task, weight_decay, pretraining) + model_builder = vit_resnet50_unet + multiple_of_32 = True else: - if input_height != (num_patches_y * transformer_patchsize_y): - print("Error: transformer_patchsize_y or transformer_num_patches_xy height value error . input_height should be equal to ( transformer_num_patches_xy height value * transformer_patchsize_y)") - sys.exit(1) - if input_width != (num_patches_x * transformer_patchsize_x): - print("Error: transformer_patchsize_x or transformer_num_patches_xy width value error . input_width should be equal to ( transformer_num_patches_xy width value * transformer_patchsize_x)") - sys.exit(1) - if (transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x)) != 0: - print("Error: transformer_projection_dim error. The remainder when parameter transformer_projection_dim is divided by (transformer_patchsize_y*transformer_patchsize_x) should be zero") - sys.exit(1) - model = vit_resnet50_unet_transformer_before_cnn(n_classes, transformer_patchsize_x, transformer_patchsize_y, num_patches, transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_projection_dim, input_height, input_width, task, weight_decay, pretraining) + model_builder = vit_resnet50_unet_transformer_before_cnn + multiple_of_32 = False + + assert input_height == num_patches_y * transformer_patchsize_y * (32 if multiple_of_32 else 1), \ + "transformer_patchsize_y or transformer_num_patches_xy height value error: " \ + "input_height should be equal to " \ + "(transformer_num_patches_xy height value * transformer_patchsize_y%s)" % \ + " * 32" if multiple_of_32 else "" + assert input_width == num_patches_x * transformer_patchsize_x * (32 if multiple_of_32 else 1), \ + "transformer_patchsize_x or transformer_num_patches_xy width value error: " \ + "input_width should be equal to " \ + "(transformer_num_patches_xy width value * transformer_patchsize_x%s)" % \ + " * 32" if multiple_of_32 else "" + assert 0 == transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x), \ + "transformer_projection_dim error: " \ + "The remainder when parameter transformer_projection_dim is divided by " \ + "(transformer_patchsize_y*transformer_patchsize_x) should be zero" + + model = model_builder( + n_classes, + transformer_patchsize_x, + transformer_patchsize_y, + num_patches, + transformer_mlp_head_units, + transformer_layers, + transformer_num_heads, + transformer_projection_dim, + input_height, + input_width, + task, + weight_decay, + pretraining) #if you want to see the model structure just uncomment model summary. 
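# Editor's note (hedged): in the assert messages just above, % formatting binds
# tighter than the conditional expression, i.e.
#     "...%s" % " * 32" if multiple_of_32 else ""
# parses as ("...%s" % " * 32") if multiple_of_32 else "", so the message collapses
# to an empty string whenever multiple_of_32 is False. Binding the suffix first
# keeps both branches intact (the names below are those of the surrounding hunk):
#     suffix = " * 32" if multiple_of_32 else ""
#     assert input_height == num_patches_y * transformer_patchsize_y * (32 if multiple_of_32 else 1), \
#         "transformer_patchsize_y or transformer_num_patches_xy height value error: " \
#         "input_height should be equal to " \
#         "(transformer_num_patches_xy height value * transformer_patchsize_y%s)" % suffix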
model.summary() - - if task == "segmentation" or task == "binarization": - if not is_loss_soft_dice and not weighted_loss: - model.compile(loss='categorical_crossentropy', - optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) + if task in ["segmentation", "binarization"]: if is_loss_soft_dice: - model.compile(loss=soft_dice_loss, - optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) - if weighted_loss: - model.compile(loss=weighted_categorical_crossentropy(weights), - optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) - elif task == "enhancement": - model.compile(loss='mean_squared_error', - optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) - + loss = soft_dice_loss + elif weighted_loss: + loss = weighted_categorical_crossentropy(weights) + else: + loss = 'categorical_crossentropy' + else: # task == "enhancement" + loss = 'mean_squared_error' + model.compile(loss=loss, + optimizer=Adam(learning_rate=learning_rate), + metrics=['accuracy']) # generating train and evaluation data - train_gen = data_gen(dir_flow_train_imgs, dir_flow_train_labels, batch_size=n_batch, - input_height=input_height, input_width=input_width, n_classes=n_classes, task=task) - val_gen = data_gen(dir_flow_eval_imgs, dir_flow_eval_labels, batch_size=n_batch, - input_height=input_height, input_width=input_width, n_classes=n_classes, task=task) - + gen_kwargs = dict(batch_size=n_batch, + input_height=input_height, + input_width=input_width, + n_classes=n_classes, + task=task) + train_gen = data_gen(dir_flow_train_imgs, dir_flow_train_labels, **gen_kwargs) + val_gen = data_gen(dir_flow_eval_imgs, dir_flow_eval_labels, **gen_kwargs) + ##img_validation_patches = os.listdir(dir_flow_eval_imgs) ##score_best=[] ##score_best.append(0) + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] if save_interval: - save_weights_callback = SaveWeightsAfterSteps(save_interval, dir_output, _config) - + callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) for i in tqdm(range(index_start, n_epochs + index_start)): - if save_interval: - model.fit( - train_gen, - steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, - validation_data=val_gen, - validation_steps=1, - epochs=1, callbacks=[save_weights_callback]) - else: - model.fit( - train_gen, - steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, - validation_data=val_gen, - validation_steps=1, - epochs=1) - - model.save(os.path.join(dir_output,'model_'+str(i))) - - with open(os.path.join(os.path.join(dir_output,'model_'+str(i)),"config.json"), "w") as fp: + model.fit( + train_gen, + steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, + validation_data=val_gen, + validation_steps=1, + epochs=1, + callbacks=callbacks) + + dir_model = os.path.join(dir_output, 'model_' + str(i)) + model.save(dir_model) + with open(os.path.join(dir_model, "config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON #os.system('rm -rf '+dir_train_flowing) #os.system('rm -rf '+dir_eval_flowing) #model.save(dir_output+'/'+'model'+'.h5') + elif task=='classification': configuration() - model = resnet50_classifier(n_classes, input_height, input_width, weight_decay, pretraining) + model = resnet50_classifier(n_classes, + input_height, + input_width, + weight_decay, + pretraining) - opt_adam = Adam(learning_rate=0.001) model.compile(loss='categorical_crossentropy', - optimizer = opt_adam,metrics=['accuracy']) - + 
optimizer=Adam(learning_rate=0.001), # rs: why not learning_rate? + metrics=['accuracy']) list_classes = list(classification_classes_name.values()) - testX, testY = generate_data_from_folder_evaluation(dir_eval, input_height, input_width, n_classes, list_classes) - - y_tot=np.zeros((testX.shape[0],n_classes)) + trainXY = generate_data_from_folder_training( + dir_train, n_batch, input_height, input_width, n_classes, list_classes) + testX, testY = generate_data_from_folder_evaluation( + dir_eval, input_height, input_width, n_classes, list_classes) + y_tot = np.zeros((testX.shape[0], n_classes)) score_best= [0] - num_rows = return_number_of_total_training_data(dir_train) weights=[] + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] for i in range(n_epochs): - history = model.fit( generate_data_from_folder_training(dir_train, n_batch , input_height, input_width, n_classes, list_classes), steps_per_epoch=num_rows / n_batch, verbose=1)#,class_weight=weights) - + history = model.fit(trainXY, + steps_per_epoch=num_rows / n_batch, + #class_weight=weights) + verbose=1, + callbacks=callbacks) y_pr_class = [] for jj in range(testY.shape[0]): y_pr=model.predict(testX[jj,:,:,:].reshape(1,input_height,input_width,3), verbose=0) @@ -433,7 +485,8 @@ def run(_config, n_classes, n_epochs, input_height, elif task=='reading_order': configuration() - model = machine_based_reading_order_model(n_classes,input_height,input_width,weight_decay,pretraining) + model = machine_based_reading_order_model( + n_classes, input_height, input_width, weight_decay, pretraining) dir_flow_train_imgs = os.path.join(dir_train, 'images') dir_flow_train_labels = os.path.join(dir_train, 'labels') @@ -447,20 +500,26 @@ def run(_config, n_classes, n_epochs, input_height, #f1score_tot = [0] indexer_start = 0 - # opt = SGD(learning_rate=0.01, momentum=0.9) - opt_adam = tf.keras.optimizers.Adam(learning_rate=0.0001) model.compile(loss="binary_crossentropy", - optimizer = opt_adam,metrics=['accuracy']) + #optimizer=SGD(learning_rate=0.01, momentum=0.9), + optimizer=Adam(learning_rate=0.0001), # rs: why not learning_rate? 
+ metrics=['accuracy']) + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] if save_interval: - save_weights_callback = SaveWeightsAfterSteps(save_interval, dir_output, _config) - + callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) + + trainXY = generate_arrays_from_folder_reading_order( + dir_flow_train_labels, dir_flow_train_imgs, + n_batch, input_height, input_width, n_classes, + thetha, augmentation) + for i in range(n_epochs): - if save_interval: - history = model.fit(generate_arrays_from_folder_reading_order(dir_flow_train_labels, dir_flow_train_imgs, n_batch, input_height, input_width, n_classes, thetha, augmentation), steps_per_epoch=num_rows / n_batch, verbose=1, callbacks=[save_weights_callback]) - else: - history = model.fit(generate_arrays_from_folder_reading_order(dir_flow_train_labels, dir_flow_train_imgs, n_batch, input_height, input_width, n_classes, thetha, augmentation), steps_per_epoch=num_rows / n_batch, verbose=1) - model.save( os.path.join(dir_output,'model_'+str(i+indexer_start) )) + history = model.fit(trainXY, + steps_per_epoch=num_rows / n_batch, + verbose=1, + callbacks=callbacks) + model.save(os.path.join(dir_output, 'model_'+str(i+indexer_start) )) with open(os.path.join(os.path.join(dir_output,'model_'+str(i)),"config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 7d01e74..c8caca9 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -12,7 +12,6 @@ from shapely import set_precision from shapely.ops import unary_union, nearest_points from .rotate import rotate_image, rotation_image_new -from . import ensure_array def contours_in_same_horizon(cy_main_hor): """ @@ -249,12 +248,14 @@ def return_contours_of_image(image): return contours, hierarchy def dilate_textline_contours(all_found_textline_polygons): + from . import ensure_array return [ensure_array( [polygon2contour(contour2polygon(contour, dilate=6)) for contour in region]) for region in all_found_textline_polygons] def dilate_textregion_contours(all_found_textregion_polygons): + from . 
import ensure_array return ensure_array( [polygon2contour(contour2polygon(contour, dilate=6)) for contour in all_found_textregion_polygons]) From 87d7ffbdd84283f0e2e6dca23d4d05431cf8bb3f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Jan 2026 11:25:00 +0100 Subject: [PATCH 50/91] training: use proper Keras callbacks and top-level loop --- ..._model_load_pretrained_weights_and_save.py | 10 -- src/eynollah/training/gt_gen_utils.py | 1 + src/eynollah/training/models.py | 3 + src/eynollah/training/train.py | 168 ++++++++---------- train/requirements.txt | 2 +- 5 files changed, 84 insertions(+), 100 deletions(-) diff --git a/src/eynollah/training/build_model_load_pretrained_weights_and_save.py b/src/eynollah/training/build_model_load_pretrained_weights_and_save.py index 9fba66b..15eaf64 100644 --- a/src/eynollah/training/build_model_load_pretrained_weights_and_save.py +++ b/src/eynollah/training/build_model_load_pretrained_weights_and_save.py @@ -1,17 +1,9 @@ import sys import click -import tensorflow as tf from .models import resnet50_unet -def configuration(): - try: - for device in tf.config.list_physical_devices('GPU'): - tf.config.experimental.set_memory_growth(device, True) - except: - print("no GPU device available", file=sys.stderr) - @click.command() def build_model_load_pretrained_weights_and_save(): n_classes = 2 @@ -21,8 +13,6 @@ def build_model_load_pretrained_weights_and_save(): pretraining = False dir_of_weights = 'model_bin_sbb_ens.h5' - # configuration() - model = resnet50_unet(n_classes, input_height, input_width, weight_decay, pretraining) model.load_weights(dir_of_weights) model.save('./name_in_another_python_version.h5') diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py index 2e3428b..b7c35ee 100644 --- a/src/eynollah/training/gt_gen_utils.py +++ b/src/eynollah/training/gt_gen_utils.py @@ -653,6 +653,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ num_col = int(text_comments.split('num_col')[1]) comment_is_sub_element = True if not comment_is_sub_element: + # FIXME: look in /Page/@custom as well num_col = None if num_col: diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index fdc5437..3b38fe8 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -1,3 +1,6 @@ +import os + +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow import keras from tensorflow.keras.models import * diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index da901b0..7ee63f9 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -32,7 +32,7 @@ os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras.optimizers import SGD, Adam from tensorflow.keras.models import load_model -from tensorflow.keras.callbacks import Callback, TensorBoard +from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard from sacred import Experiment from tqdm import tqdm from sklearn.metrics import f1_score @@ -40,26 +40,28 @@ from sklearn.metrics import f1_score import numpy as np import cv2 -class SaveWeightsAfterSteps(Callback): - def __init__(self, save_interval, save_path, _config): - super(SaveWeightsAfterSteps, self).__init__() - self.save_interval = save_interval - self.save_path = save_path - self.step_count = 0 +class SaveWeightsAfterSteps(ModelCheckpoint): + def __init__(self, 
save_interval, save_path, _config, **kwargs): + if save_interval: + # batches + super().__init__( + os.path.join(save_path, "model_step_{batch:04d}"), + save_freq=save_interval, + verbose=1, + **kwargs) + else: + super().__init__( + os.path.join(save_path, "model_{epoch:02d}"), + save_freq="epoch", + verbose=1, + **kwargs) self._config = _config - def on_train_batch_end(self, batch, logs=None): - self.step_count += 1 - - if self.step_count % self.save_interval ==0: - save_file = f"{self.save_path}/model_step_{self.step_count}" - #os.system('mkdir '+save_file) - - self.model.save(save_file) - - with open(os.path.join(os.path.join(self.save_path, f"model_step_{self.step_count}"),"config.json"), "w") as fp: - json.dump(self._config, fp) # encode dict into JSON - print(f"saved model as steps {self.step_count} to {save_file}") + # overwrite tf-keras (Keras 2) implementation to get our _config JSON in + def _save_handler(self, filepath): + super()._save_handler(filepath) + with open(os.path.join(filepath, "config.json"), "w") as fp: + json.dump(self._config, fp) # encode dict into JSON def configuration(): @@ -396,23 +398,19 @@ def run(_config, n_classes, n_epochs, input_height, ##score_best=[] ##score_best.append(0) - callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), + SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) - for i in tqdm(range(index_start, n_epochs + index_start)): - model.fit( - train_gen, - steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, - validation_data=val_gen, - validation_steps=1, - epochs=1, - callbacks=callbacks) - - dir_model = os.path.join(dir_output, 'model_' + str(i)) - model.save(dir_model) - with open(os.path.join(dir_model, "config.json"), "w") as fp: - json.dump(_config, fp) # encode dict into JSON + model.fit( + train_gen, + steps_per_epoch=len(os.listdir(dir_flow_train_imgs)) // n_batch - 1, + validation_data=val_gen, + #validation_steps=1, # rs: only one batch?? 
+ validation_steps=len(os.listdir(dir_flow_eval_imgs)) // n_batch - 1, + epochs=n_epochs, + callbacks=callbacks) #os.system('rm -rf '+dir_train_flowing) #os.system('rm -rf '+dir_eval_flowing) @@ -434,54 +432,49 @@ def run(_config, n_classes, n_epochs, input_height, list_classes = list(classification_classes_name.values()) trainXY = generate_data_from_folder_training( dir_train, n_batch, input_height, input_width, n_classes, list_classes) - testX, testY = generate_data_from_folder_evaluation( + testXY = generate_data_from_folder_evaluation( dir_eval, input_height, input_width, n_classes, list_classes) y_tot = np.zeros((testX.shape[0], n_classes)) - score_best= [0] num_rows = return_number_of_total_training_data(dir_train) - weights=[] - callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), + SaveWeightsAfterSteps(0, dir_output, _config, + monitor='val_f1', + save_best_only=True, mode='max')] - for i in range(n_epochs): - history = model.fit(trainXY, - steps_per_epoch=num_rows / n_batch, - #class_weight=weights) - verbose=1, - callbacks=callbacks) - y_pr_class = [] - for jj in range(testY.shape[0]): - y_pr=model.predict(testX[jj,:,:,:].reshape(1,input_height,input_width,3), verbose=0) - y_pr_ind= np.argmax(y_pr,axis=1) - y_pr_class.append(y_pr_ind) - - y_pr_class = np.array(y_pr_class) - f1score=f1_score(np.argmax(testY,axis=1), y_pr_class, average='macro') - print(i,f1score) - - if f1score>score_best[0]: - score_best[0]=f1score - model.save(os.path.join(dir_output,'model_best')) - - if f1score > f1_threshold_classification: - weights.append(model.get_weights() ) - + history = model.fit(trainXY, + steps_per_epoch=num_rows / n_batch, + #class_weight=weights) + validation_data=testXY, + verbose=1, + epochs=n_epochs, + metrics=[F1Score(average='macro', name='f1')], + callbacks=callbacks) - if len(weights) >= 1: - new_weights=list() - for weights_list_tuple in zip(*weights): - new_weights.append( [np.array(weights_).mean(axis=0) for weights_ in zip(*weights_list_tuple)] ) + usable_checkpoints = np.flatnonzero(np.array(history['val_f1']) > f1_threshold_classification) + if len(usable_checkpoints) >= 1: + print("averaging over usable checkpoints", usable_checkpoints) + all_weights = [] + for epoch in usable_checkpoints: + cp_path = os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch)) + assert os.path.isdir(cp_path) + model = load_model(cp_path, compile=False) + all_weights.append(model.get_weights()) + + new_weights = [] + for layer_weights in zip(*all_weights): + layer_weights = np.array([np.array(weights).mean(axis=0) + for weights in zip(*layer_weights)]) + new_weights.append(layer_weights) - new_weights = [np.array(x) for x in new_weights] - model_weight_averaged=tf.keras.models.clone_model(model) - model_weight_averaged.set_weights(new_weights) - - model_weight_averaged.save(os.path.join(dir_output,'model_ens_avg')) - with open(os.path.join( os.path.join(dir_output,'model_ens_avg'), "config.json"), "w") as fp: + #model = tf.keras.models.clone_model(model) + model.set_weights(new_weights) + + cp_path = os.path.join(dir_output, 'model_ens_avg') + model.save(cp_path) + with open(os.path.join(cp_path, "config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON - - with open(os.path.join( os.path.join(dir_output,'model_best'), "config.json"), "w") as fp: - json.dump(_config, fp) # encode dict into JSON + print("ensemble model saved under", cp_path) elif 
task=='reading_order': configuration() @@ -505,7 +498,8 @@ def run(_config, n_classes, n_epochs, input_height, optimizer=Adam(learning_rate=0.0001), # rs: why not learning_rate? metrics=['accuracy']) - callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), + SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) @@ -514,20 +508,16 @@ def run(_config, n_classes, n_epochs, input_height, n_batch, input_height, input_width, n_classes, thetha, augmentation) - for i in range(n_epochs): - history = model.fit(trainXY, - steps_per_epoch=num_rows / n_batch, - verbose=1, - callbacks=callbacks) - model.save(os.path.join(dir_output, 'model_'+str(i+indexer_start) )) - - with open(os.path.join(os.path.join(dir_output,'model_'+str(i)),"config.json"), "w") as fp: - json.dump(_config, fp) # encode dict into JSON - ''' - if f1score>f1score_tot[0]: - f1score_tot[0] = f1score - model_dir = os.path.join(dir_out,'model_best') - model.save(model_dir) - ''' + history = model.fit(trainXY, + steps_per_epoch=num_rows / n_batch, + verbose=1, + epochs=n_epochs, + callbacks=callbacks) + ''' + if f1score>f1score_tot[0]: + f1score_tot[0] = f1score + model_dir = os.path.join(dir_out,'model_best') + model.save(model_dir) + ''' diff --git a/train/requirements.txt b/train/requirements.txt index 63f3813..8ad884d 100644 --- a/train/requirements.txt +++ b/train/requirements.txt @@ -1,6 +1,6 @@ sacred seaborn -numpy <1.24.0 +numpy tqdm imutils scipy From 6a81db934e16971bc7edcf4b0b41a918dc444d5c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Jan 2026 11:25:50 +0100 Subject: [PATCH 51/91] improve docs/train.md --- docs/train.md | 168 ++++++++++++++++++++++++++++---------------------- 1 file changed, 96 insertions(+), 72 deletions(-) diff --git a/docs/train.md b/docs/train.md index 252bead..4e76740 100644 --- a/docs/train.md +++ b/docs/train.md @@ -9,9 +9,9 @@ on how to generate the corresponding training dataset. The following three tasks can all be accomplished using the code in the [`train`](https://github.com/qurator-spk/eynollah/tree/main/train) directory: -* generate training dataset -* train a model -* inference with the trained model +* [Generate training dataset](#generate-training-dataset) +* [Train a model](#train-a-model) +* [Inference with the trained model](#inference-with-the-trained-model) ## Training, evaluation and output @@ -63,7 +63,7 @@ serve as labels. The enhancement model can be trained with this generated datase For machine-based reading order, we aim to determine the reading priority between two sets of text regions. The model's input is a three-channel image: the first and last channels contain information about each of the two text regions, while the middle channel encodes prominent layout elements necessary for reading order, such as separators and headers. -To generate the training dataset, our script requires a page XML file that specifies the image layout with the correct +To generate the training dataset, our script requires a PAGE XML file that specifies the image layout with the correct reading order. For output images, it is necessary to specify the width and height. 
Additionally, a minimum text region size can be set @@ -82,8 +82,14 @@ eynollah-training generate-gt machine-based-reading-order \ ### pagexml2label -pagexml2label is designed to generate labels from GT page XML files for various pixel-wise segmentation use cases, -including 'layout,' 'textline,' 'printspace,' 'glyph,' and 'word' segmentation. +`pagexml2label` is designed to generate labels from PAGE XML GT files for various pixel-wise segmentation use cases, +including: +- `printspace` (i.e. page frame), +- `layout` (i.e. regions), +- `textline`, +- `word`, and +- `glyph`. + To train a pixel-wise segmentation model, we require images along with their corresponding labels. Our training script expects a PNG image where each pixel corresponds to a label, represented by an integer. The background is always labeled as zero, while other elements are assigned different integers. For instance, if we have ground truth data with four @@ -93,7 +99,7 @@ In binary segmentation scenarios such as textline or page extraction, the backgr element is automatically encoded as 1 in the PNG label. To specify the desired use case and the elements to be extracted in the PNG labels, a custom JSON file can be passed. -For example, in the case of 'textline' detection, the JSON file would resemble this: +For example, in the case of textline detection, the JSON contents could be this: ```yaml { @@ -101,61 +107,77 @@ For example, in the case of 'textline' detection, the JSON file would resemble t } ``` -In the case of layout segmentation a custom config json file can look like this: +In the case of layout segmentation, the config JSON file might look like this: ```yaml { "use_case": "layout", -"textregions":{"rest_as_paragraph":1 , "drop-capital": 1, "header":2, "heading":2, "marginalia":3}, -"imageregion":4, -"separatorregion":5, -"graphicregions" :{"rest_as_decoration":6 ,"stamp":7} +"textregions": {"rest_as_paragraph": 1, "drop-capital": 1, "header": 2, "heading": 2, "marginalia": 3}, +"imageregion": 4, +"separatorregion": 5, +"graphicregions": {"rest_as_decoration": 6, "stamp": 7} } ``` -A possible custom config json file for layout segmentation where the "printspace" is a class: +The same example if `PrintSpace` (or `Border`) should be represented as a unique class: ```yaml { "use_case": "layout", -"textregions":{"rest_as_paragraph":1 , "drop-capital": 1, "header":2, "heading":2, "marginalia":3}, -"imageregion":4, -"separatorregion":5, -"graphicregions" :{"rest_as_decoration":6 ,"stamp":7} -"printspace_as_class_in_layout" : 8 +"textregions": {"rest_as_paragraph": 1, "drop-capital": 1, "header": 2, "heading": 2, "marginalia": 3}, +"imageregion": 4, +"separatorregion": 5, +"graphicregions": {"rest_as_decoration": 6, "stamp": 7} +"printspace_as_class_in_layout": 8 } ``` -For the layout use case, it is beneficial to first understand the structure of the page XML file and its elements. -In a given image, the annotations of elements are recorded in a page XML file, including their contours and classes. -For an image document, the known regions are 'textregion', 'separatorregion', 'imageregion', 'graphicregion', -'noiseregion', and 'tableregion'. +In the `layout` use-case, it is beneficial to first understand the structure of the PAGE XML file and its elements. +For a given page image, the visible segments are annotated in XML with their polygon coordinates and types. +On the region level, available segment types include `TextRegion`, `SeparatorRegion`, `ImageRegion`, `GraphicRegion`, +`NoiseRegion` and `TableRegion`. 
-Text regions and graphic regions also have their own specific types. The known types for text regions are 'paragraph', -'header', 'heading', 'marginalia', 'drop-capital', 'footnote', 'footnote-continued', 'signature-mark', 'page-number', -and 'catch-word'. The known types for graphic regions are 'handwritten-annotation', 'decoration', 'stamp', and -'signature'. -Since we don't know all types of text and graphic regions, unknown cases can arise. To handle these, we have defined -two additional types, "rest_as_paragraph" and "rest_as_decoration", to ensure that no unknown types are missed. -This way, users can extract all known types from the labels and be confident that no unknown types are overlooked. +Moreover, text regions and graphic regions in particular are subdivided via `@type`: +- The allowed subtypes for text regions are `paragraph`, `heading`, `marginalia`, `drop-capital`, `header`, `footnote`, +`footnote-continued`, `signature-mark`, `page-number` and `catch-word`. +- The known subtypes for graphic regions are `handwritten-annotation`, `decoration`, `stamp` and `signature`. -In the custom JSON file shown above, "header" and "heading" are extracted as the same class, while "marginalia" is shown -as a different class. All other text region types, including "drop-capital," are grouped into the same class. For the -graphic region, "stamp" has its own class, while all other types are classified together. "Image region" and "separator -region" are also present in the label. However, other regions like "noise region" and "table region" will not be -included in the label PNG file, even if they have information in the page XML files, as we chose not to include them. +These types and subtypes must be mapped to classes for the segmentation model. However, sometimes these fine-grained +distinctions are not useful or the existing annotations are not very usable (too scarce or too unreliable). +In that case, instead of these subtypes with a specific mapping, they can be pooled together by using the two special +types: +- `rest_as_paragraph` (mapping missing TextRegion subtypes and `paragraph`) +- `rest_as_decoration` (mapping missing GraphicRegion subtypes and `decoration`) + +(That way, users can extract all known types from the labels and be confident that no subtypes are overlooked.) + +In the custom JSON example shown above, `header` and `heading` are extracted as the same class, +while `marginalia` is modelled as a different class. All other text region types, including `drop-capital`, +are grouped into the same class. For graphic regions, `stamp` has its own class, while all other types +are classified together. `ImageRegion` and `SeparatorRegion` will also represented with a class label in the +training data. However, other regions like `NoiseRegion` or `TableRegion` will not be included in the PNG files, +even if they were present in the PAGE XML. + +The tool expects various command-line options: ```sh eynollah-training generate-gt pagexml2label \ - -dx "dir of GT xml files" \ - -do "dir where output label png files will be written" \ - -cfg "custom config json file" \ - -to "output type which has 2d and 3d. 2d is used for training and 3d is just to visualise the labels" + -dx "dir of input PAGE XML files" \ + -do "dir of output label PNG files" \ + -cfg "custom config JSON file" \ + -to "output type (2d or 3d)" ``` -We have also defined an artificial class that can be added to the boundary of text region types or text lines. This key -is called "artificial_class_on_boundary." 
If users want to apply this to certain text regions in the layout use case, -the example JSON config file should look like this: +As output type, use +- `2d` for training, +- `3d` to just visualise the labels. + +We have also defined an artificial class that can be added to (rendered around) the boundary +of text region types or text lines in order to make separation of neighbouring segments more +reliable. The key is called `artificial_class_on_boundary`, and it takes a list of text region +types to be applied to. + +Our example JSON config file could then look like this: ```yaml { @@ -177,14 +199,15 @@ the example JSON config file should look like this: } ``` -This implies that the artificial class label, denoted by 7, will be present on PNG files and will only be added to the -elements labeled as "paragraph," "header," "heading," and "marginalia." +This implies that the artificial class label (denoted by 7) will be present in the generated PNG files +and will only be added around segments labeled `paragraph`, `header`, `heading` or `marginalia`. (This +class will be handled specially during decoding at inference, and not show up in final results.) -For "textline", "word", and "glyph", the artificial class on the boundaries will be activated only if the -"artificial_class_label" key is specified in the config file. Its value should be set as 2 since these elements -represent binary cases. For example, if the background and textline are denoted as 0 and 1 respectively, then the -artificial class should be assigned the value 2. The example JSON config file should look like this for "textline" use -case: +For `printspace`, `textline`, `word`, and `glyph` segmentation use-cases, there is no `artificial_class_on_boundary` key, +but `artificial_class_label` is available. If specified in the config file, then its value should be set at 2, because +these elements represent binary classification problems (with background represented as 0, and segments as 1, respectively). + +For example, the JSON config for textline detection could look as follows: ```yaml { @@ -193,33 +216,33 @@ case: } ``` -If the coordinates of "PrintSpace" or "Border" are present in the page XML ground truth files, and the user wishes to -crop only the print space area, this can be achieved by activating the "-ps" argument. However, it should be noted that -in this scenario, since cropping will be applied to the label files, the directory of the original images must be -provided to ensure that they are cropped in sync with the labels. This ensures that the correct images and labels -required for training are obtained. The command should resemble the following: +If the coordinates of `PrintSpace` (or `Border`) are present in the PAGE XML ground truth files, +and one wishes to crop images to only cover the print space bounding box, this can be achieved +by passing the `-ps` option. Note that in this scenario, the directory of the original images +must also be provided, to ensure that the images are cropped in sync with the labels. The command +line would then resemble this: ```sh eynollah-training generate-gt pagexml2label \ - -dx "dir of GT xml files" \ - -do "dir where output label png files will be written" \ - -cfg "custom config json file" \ - -to "output type which has 2d and 3d. 
2d is used for training and 3d is just to visualise the labels" \ + -dx "dir of input PAGE XML files" \ + -do "dir of output label PNG files" \ + -cfg "custom config JSON file" \ + -to "output type (2d or 3d)" \ -ps \ - -di "dir where the org images are located" \ - -doi "dir where the cropped output images will be written" + -di "dir of input original images" \ + -doi "dir of output cropped images" ``` ## Train a model ### classification -For the classification use case, we haven't provided a ground truth generator, as it's unnecessary. For classification, -all we require is a training directory with subdirectories, each containing images of its respective classes. We need +For the image classification use-case, we have not provided a ground truth generator, as it is unnecessary. +All we require is a training directory with subdirectories, each containing images of its respective classes. We need separate directories for training and evaluation, and the class names (subdirectories) must be consistent across both directories. Additionally, the class names should be specified in the config JSON file, as shown in the following example. If, for instance, we aim to classify "apple" and "orange," with a total of 2 classes, the -"classification_classes_name" key in the config file should appear as follows: +`classification_classes_name` key in the config file should appear as follows: ```yaml { @@ -241,7 +264,7 @@ example. If, for instance, we aim to classify "apple" and "orange," with a total } ``` -The "dir_train" should be like this: +Then `dir_train` should be like this: ``` . @@ -250,7 +273,7 @@ The "dir_train" should be like this: └── orange # directory of images for orange class ``` -And the "dir_eval" the same structure as train directory: +And `dir_eval` analogously: ``` . @@ -310,7 +333,7 @@ And the "dir_eval" the same structure as train directory: └── labels # directory of labels ``` -The classification model can be trained like the classification case command line. +The reading-order model can be trained like the classification case command line. ### Segmentation (Textline, Binarization, Page extraction and layout) and enhancement @@ -374,9 +397,9 @@ classification and machine-based reading order, as you can see in their example * `transformer_num_heads`: Transformer number of heads. Default value is 4. * `transformer_cnn_first`: We have two types of vision transformers. In one type, a CNN is applied first, followed by a transformer. In the other type, this order is reversed. If transformer_cnn_first is true, it means the CNN will be applied before the transformer. Default value is true. -In the case of segmentation and enhancement the train and evaluation directory should be as following. +In case of segmentation and enhancement the train and evaluation data should be organised as follows. -The "dir_train" should be like this: +The "dir_train" directory should be like this: ``` . 
@@ -394,11 +417,12 @@ And the "dir_eval" the same structure as train directory: └── labels # directory of labels ``` -After configuring the JSON file for segmentation or enhancement, training can be initiated by running the following -command, similar to the process for classification and reading order: +After configuring the JSON file for segmentation or enhancement, +training can be initiated by running the following command line, +similar to classification and reading-order model training: -``` -eynollah-training train with config_classification.json` +```sh +eynollah-training train with config_classification.json ``` #### Binarization @@ -690,7 +714,7 @@ This will straightforwardly return the class of the image. ### machine based reading order -To infer the reading order using a reading order model, we need a page XML file containing layout information but +To infer the reading order using a reading order model, we need a PAGE XML file containing layout information but without the reading order. We simply need to provide the model directory, the XML file, and the output directory. The new XML file with the added reading order will be written to the output directory with the same name. We need to run: From eb92760f73f9d8eefa9028ea697c4152d07e39ec Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Jan 2026 19:49:39 +0100 Subject: [PATCH 52/91] training: download pretrained RESNET weights if missing --- src/eynollah/training/models.py | 17 ++++++++++------- src/eynollah/training/train.py | 15 ++++++++++++++- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 3b38fe8..011c614 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -12,7 +12,10 @@ from tensorflow.keras.regularizers import l2 ###projection_dim = 64 ##transformer_layers = 2#8 ##num_heads = 1#4 -resnet50_Weights_path = './pretrained_model/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5' +RESNET50_WEIGHTS_PATH = './pretrained_model/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5' +RESNET50_WEIGHTS_URL = ('https://github.com/fchollet/deep-learning-models/releases/download/v0.2/' + 'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5') + IMAGE_ORDERING = 'channels_last' MERGE_AXIS = -1 @@ -242,7 +245,7 @@ def resnet50_unet_light(n_classes, input_height=224, input_width=224, taks="segm f5 = x if pretraining: - model = Model(img_input, x).load_weights(resnet50_Weights_path) + model = Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) v512_2048 = Conv2D(512, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f5) v512_2048 = (BatchNormalization(axis=bn_axis))(v512_2048) @@ -343,7 +346,7 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati f5 = x if pretraining: - Model(img_input, x).load_weights(resnet50_Weights_path) + Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) v1024_2048 = Conv2D(1024, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))( f5) @@ -442,7 +445,7 @@ def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_he f5 = x if pretraining: - model = Model(inputs, x).load_weights(resnet50_Weights_path) + model = Model(inputs, x).load_weights(RESNET50_WEIGHTS_PATH) #num_patches = x.shape[1]*x.shape[2] @@ -590,7 +593,7 @@ def vit_resnet50_unet_transformer_before_cnn(n_classes, patch_size_x, patch_size f5 = x if pretraining: - model = Model(encoded_patches, 
x).load_weights(resnet50_Weights_path) + model = Model(encoded_patches, x).load_weights(RESNET50_WEIGHTS_PATH) v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(x) v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) @@ -690,7 +693,7 @@ def resnet50_classifier(n_classes,input_height=224,input_width=224,weight_decay= f5 = x if pretraining: - Model(img_input, x).load_weights(resnet50_Weights_path) + Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) x = AveragePooling2D((7, 7), name='avg_pool')(x) x = Flatten()(x) @@ -746,7 +749,7 @@ def machine_based_reading_order_model(n_classes,input_height=224,input_width=224 x1 = identity_block(x1, 3, [512, 512, 2048], stage=5, block='c') if pretraining: - Model(img_input , x1).load_weights(resnet50_Weights_path) + Model(img_input , x1).load_weights(RESNET50_WEIGHTS_PATH) x1 = AveragePooling2D((7, 7), name='avg_pool1')(x1) flattened = Flatten()(x1) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 7ee63f9..6353474 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -2,6 +2,7 @@ import os import sys import json +import requests import click from eynollah.training.metrics import ( @@ -15,7 +16,9 @@ from eynollah.training.models import ( resnet50_classifier, resnet50_unet, vit_resnet50_unet, - vit_resnet50_unet_transformer_before_cnn + vit_resnet50_unet_transformer_before_cnn, + RESNET50_WEIGHTS_PATH, + RESNET50_WEIGHTS_URL ) from eynollah.training.utils import ( data_gen, @@ -80,6 +83,12 @@ def get_dirs_or_files(input_data): assert os.path.isdir(labels_input), "{} is not a directory".format(labels_input) return image_input, labels_input +def download_file(url, path): + with open(path, 'wb') as f: + with requests.get(url, stream=True) as r: + r.raise_for_status() + for data in r.iter_content(chunk_size=4096): + f.write(data) ex = Experiment(save_git_info=False) @@ -163,6 +172,10 @@ def run(_config, n_classes, n_epochs, input_height, transformer_patchsize_x, transformer_patchsize_y, transformer_num_patches_xy, backbone_type, save_interval, flip_index, dir_eval, dir_output, pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name, dir_img_bin, number_of_backgrounds_per_image,dir_rgb_backgrounds, dir_rgb_foregrounds): + + if pretraining and not os.path.isfile(RESNET50_WEIGHTS_PATH): + print("downloading RESNET50 pretrained weights to", RESNET50_WEIGHTS_PATH) + download_file(RESNET50_WEIGHTS_URL, RESNET50_WEIGHTS_PATH) if dir_rgb_backgrounds: list_all_possible_background_images = os.listdir(dir_rgb_backgrounds) From acda9c84eecca75e5260b2172923f59e86838a73 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Jan 2026 13:28:03 +0100 Subject: [PATCH 53/91] =?UTF-8?q?training.gt=5Fgen=5Futils:=20improve=20XM?= =?UTF-8?q?L=E2=86=92img=20path=20mapping=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit when matching files in `dir_images` by XML path name stem, * use `dict` instead of `list` to assign reliably * filter out `.xml` files (so input directories can be mixed) * show informative warnings for files which cannot be matched --- src/eynollah/training/gt_gen_utils.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py index b7c35ee..f4defdd 100644 --- a/src/eynollah/training/gt_gen_utils.py +++ 
b/src/eynollah/training/gt_gen_utils.py @@ -627,7 +627,10 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if dir_images: ls_org_imgs = os.listdir(dir_images) - ls_org_imgs_stem = [os.path.splitext(item)[0] for item in ls_org_imgs] + ls_org_imgs = {os.path.splitext(item)[0]: item + for item in ls_org_imgs + if not item.endswith('.xml')} + for index in tqdm(range(len(gt_list))): #try: print(gt_list[index]) @@ -802,7 +805,13 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ cv2.imwrite(os.path.join(output_dir, xml_file_stem + '.png'), img_poly) if dir_images: - org_image_name = ls_org_imgs[ls_org_imgs_stem.index(xml_file_stem)] + org_image_name = ls_org_imgs[xml_file_stem] + if not org_image_name: + print("image file for XML stem", xml_file_stem, "is missing") + continue + if not os.path.isfile(os.path.join(dir_images, org_image_name)): + print("image file for XML stem", xml_file_stem, "is not readable") + continue img_org = cv2.imread(os.path.join(dir_images, org_image_name)) if printspace and config_params['use_case']!='printspace': @@ -1266,7 +1275,13 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if dir_images: - org_image_name = ls_org_imgs[ls_org_imgs_stem.index(xml_file_stem)] + org_image_name = ls_org_imgs[xml_file_stem] + if not org_image_name: + print("image file for XML stem", xml_file_stem, "is missing") + continue + if not os.path.isfile(os.path.join(dir_images, org_image_name)): + print("image file for XML stem", xml_file_stem, "is not readable") + continue img_org = cv2.imread(os.path.join(dir_images, org_image_name)) if printspace: From 0372fd7a1ec2e4d654c0f24171c9b30c77a3e09b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Jan 2026 13:42:59 +0100 Subject: [PATCH 54/91] =?UTF-8?q?training.gt=5Fgen=5Futils:=20fix+simplify?= =?UTF-8?q?=20cropping=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit when parsing `PrintSpace` or `Border` from PAGE-XML, - use `lxml` XPath instead of nested loops - convert points to polygons directly (instead of painting on canvas and retrieving contours) - pass result bbox in slice notation (instead of xywh) --- src/eynollah/training/gt_gen_utils.py | 151 ++++++++------------------ src/eynollah/training/inference.py | 18 ++- 2 files changed, 51 insertions(+), 118 deletions(-) diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py index f4defdd..f068afd 100644 --- a/src/eynollah/training/gt_gen_utils.py +++ b/src/eynollah/training/gt_gen_utils.py @@ -1,15 +1,18 @@ import os import numpy as np import warnings -import xml.etree.ElementTree as ET +from lxml import etree as ET from tqdm import tqdm import cv2 from shapely import geometry from pathlib import Path from PIL import ImageFont +from ocrd_utils import bbox_from_points KERNEL = np.ones((5, 5), np.uint8) +NS = { 'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15' +} with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -664,52 +667,13 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ y_new = int ( x_new * (y_len / float(x_len)) ) if printspace or "printspace_as_class_in_layout" in list(config_params.keys()): - region_tags = np.unique([x for x in alltags if x.endswith('PrintSpace') or x.endswith('Border')]) - co_use_case = [] - - for tag in region_tags: - tag_endings = ['}PrintSpace','}Border'] - - if tag.endswith(tag_endings[0]) 
or tag.endswith(tag_endings[1]): - for nn in root1.iter(tag): - c_t_in = [] - sumi = 0 - for vv in nn.iter(): - # check the format of coords - if vv.tag == link + 'Coords': - coords = bool(vv.attrib) - if coords: - p_h = vv.attrib['points'].split(' ') - c_t_in.append( - np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) - break - else: - pass - - if vv.tag == link + 'Point': - c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))]) - sumi += 1 - elif vv.tag != link + 'Point' and sumi >= 1: - break - co_use_case.append(np.array(c_t_in)) - - img = np.zeros((y_len, x_len, 3)) - - img_poly = cv2.fillPoly(img, pts=co_use_case, color=(1, 1, 1)) - - img_poly = img_poly.astype(np.uint8) - - imgray = cv2.cvtColor(img_poly, cv2.COLOR_BGR2GRAY) - _, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) - - cnt = contours[np.argmax(cnt_size)] - - x, y, w, h = cv2.boundingRect(cnt) - bb_xywh = [x, y, w, h] + ps = (root1.xpath('/pc:PcGts/pc:Page/pc:Border', namespaces=NS) + + root1.xpath('/pc:PcGts/pc:Page/pc:PrintSpace', namespaces=NS)) + if len(ps): + points = ps[0].find('pc:Coords', NS).get('points') + ps_bbox = bbox_from_points(points) + else: + ps_bbox = [0, 0, None, None] if config_file and (config_params['use_case']=='textline' or config_params['use_case']=='word' or config_params['use_case']=='glyph' or config_params['use_case']=='printspace'): @@ -791,7 +755,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if printspace and config_params['use_case']!='printspace': - img_poly = img_poly[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + img_poly = img_poly[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] if 'columns_width' in list(config_params.keys()) and num_col and config_params['use_case']!='printspace': @@ -815,7 +780,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_org = cv2.imread(os.path.join(dir_images, org_image_name)) if printspace and config_params['use_case']!='printspace': - img_org = img_org[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + img_org = img_org[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] if 'columns_width' in list(config_params.keys()) and num_col and config_params['use_case']!='printspace': img_org = resize_image(img_org, y_new, x_new) @@ -1194,7 +1160,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if "printspace_as_class_in_layout" in list(config_params.keys()): printspace_mask = np.zeros((img_poly.shape[0], img_poly.shape[1])) - printspace_mask[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2]] = 1 + printspace_mask[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2]] = 1 img_poly[:,:,0][printspace_mask[:,:] == 0] = printspace_class_rgb_color[0] img_poly[:,:,1][printspace_mask[:,:] == 0] = printspace_class_rgb_color[1] @@ -1252,7 +1219,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if "printspace_as_class_in_layout" in list(config_params.keys()): printspace_mask = np.zeros((img_poly.shape[0], img_poly.shape[1])) - printspace_mask[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2]] = 1 + printspace_mask[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2]] = 1 img_poly[:,:,0][printspace_mask[:,:] == 0] = printspace_class_label 
img_poly[:,:,1][printspace_mask[:,:] == 0] = printspace_class_label @@ -1261,7 +1229,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if printspace: - img_poly = img_poly[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + img_poly = img_poly[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] if 'columns_width' in list(config_params.keys()) and num_col: img_poly = resize_image(img_poly, y_new, x_new) @@ -1285,7 +1254,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_org = cv2.imread(os.path.join(dir_images, org_image_name)) if printspace: - img_org = img_org[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + img_org = img_org[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] if 'columns_width' in list(config_params.keys()) and num_col: img_org = resize_image(img_org, y_new, x_new) @@ -1326,6 +1296,7 @@ def find_new_features_of_contours(contours_main): y_max_main = np.array([np.max(contours_main[j][:, 1]) for j in range(len(contours_main))]) return cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin + def read_xml(xml_file): file_name = Path(xml_file).stem tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8')) @@ -1344,57 +1315,13 @@ def read_xml(xml_file): index_tot_regions.append(jj.attrib['index']) tot_region_ref.append(jj.attrib['regionRef']) - if (link+'PrintSpace' in alltags) or (link+'Border' in alltags): - co_printspace = [] - if link+'PrintSpace' in alltags: - region_tags_printspace = np.unique([x for x in alltags if x.endswith('PrintSpace')]) - elif link+'Border' in alltags: - region_tags_printspace = np.unique([x for x in alltags if x.endswith('Border')]) - - for tag in region_tags_printspace: - if link+'PrintSpace' in alltags: - tag_endings_printspace = ['}PrintSpace','}printspace'] - elif link+'Border' in alltags: - tag_endings_printspace = ['}Border','}border'] - - if tag.endswith(tag_endings_printspace[0]) or tag.endswith(tag_endings_printspace[1]): - for nn in root1.iter(tag): - c_t_in = [] - sumi = 0 - for vv in nn.iter(): - # check the format of coords - if vv.tag == link + 'Coords': - coords = bool(vv.attrib) - if coords: - p_h = vv.attrib['points'].split(' ') - c_t_in.append( - np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) - break - else: - pass - - if vv.tag == link + 'Point': - c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))]) - sumi += 1 - elif vv.tag != link + 'Point' and sumi >= 1: - break - co_printspace.append(np.array(c_t_in)) - img_printspace = np.zeros( (y_len,x_len,3) ) - img_printspace=cv2.fillPoly(img_printspace, pts =co_printspace, color=(1,1,1)) - img_printspace = img_printspace.astype(np.uint8) - - imgray = cv2.cvtColor(img_printspace, cv2.COLOR_BGR2GRAY) - _, thresh = cv2.threshold(imgray, 0, 255, 0) - contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) - cnt = contours[np.argmax(cnt_size)] - x, y, w, h = cv2.boundingRect(cnt) - - bb_coord_printspace = [x, y, w, h] - + ps = (root1.xpath('/pc:PcGts/pc:Page/pc:Border', namespaces=NS) + + root1.xpath('/pc:PcGts/pc:Page/pc:PrintSpace', namespaces=NS)) + if len(ps): + points = ps[0].find('pc:Coords', NS).get('points') + ps_bbox = bbox_from_points(points) else: - bb_coord_printspace = None - + ps_bbox = [0, 0, None, None] region_tags=np.unique([x for x in alltags if 
x.endswith('Region')]) co_text_paragraph=[] @@ -1749,11 +1676,19 @@ def read_xml(xml_file): img_poly=cv2.fillPoly(img, pts =co_img, color=(4,4,4)) img_poly=cv2.fillPoly(img, pts =co_sep, color=(5,5,5)) - return tree1, root1, bb_coord_printspace, file_name, id_paragraph, id_header+id_heading, co_text_paragraph, co_text_header+co_text_heading,\ -tot_region_ref,x_len, y_len,index_tot_regions, img_poly - - - + return (tree1, + root1, + ps_bbox, + file_name, + id_paragraph, + id_header + id_heading, + co_text_paragraph, + co_text_header + co_text_heading, + tot_region_ref, + x_len, + y_len, + index_tot_regions, + img_poly) def bounding_box(cnt,color, corr_order_index ): x, y, w, h = cv2.boundingRect(cnt) diff --git a/src/eynollah/training/inference.py b/src/eynollah/training/inference.py index 15d1e6a..2ef1a91 100644 --- a/src/eynollah/training/inference.py +++ b/src/eynollah/training/inference.py @@ -196,7 +196,7 @@ class SBBPredict: img_height = self.config_params_model['input_height'] img_width = self.config_params_model['input_width'] - tree_xml, root_xml, bb_coord_printspace, file_name, \ + tree_xml, root_xml, ps_bbox, file_name, \ id_paragraph, id_header, \ co_text_paragraph, co_text_header, \ tot_region_ref, x_len, y_len, index_tot_regions, \ @@ -236,15 +236,13 @@ class SBBPredict: img_label=cv2.fillPoly(img_label, pts =[co_text_all[i]], color=(1,1,1)) labels_con[:,:,i] = img_label[:,:,0] - if bb_coord_printspace: - #bb_coord_printspace[x,y,w,h,_,_] - x = bb_coord_printspace[0] - y = bb_coord_printspace[1] - w = bb_coord_printspace[2] - h = bb_coord_printspace[3] - labels_con = labels_con[y:y+h, x:x+w, :] - img_poly = img_poly[y:y+h, x:x+w, :] - img_header_and_sep = img_header_and_sep[y:y+h, x:x+w] + if ps_bbox: + labels_con = labels_con[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] + img_poly = img_poly[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] + img_header_and_sep = img_header_and_sep[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2]] From e69b35b49c4e7816b0e88d0d5d48f79aaf3f46db Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Jan 2026 13:49:23 +0100 Subject: [PATCH 55/91] training.train.config_params: re-organise to reflect dependencies - re-order keys belonging together logically - make keys dependent on each other --- src/eynollah/training/train.py | 222 +++++++++++++++++---------------- 1 file changed, 115 insertions(+), 107 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 6353474..e93281a 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -95,136 +95,144 @@ ex = Experiment(save_git_info=False) @ex.config def config_params(): + task = "segmentation" # This parameter defines task of model which can be segmentation, enhancement or classification. + backbone_type = None # Type of image feature map network backbone. Either a vision transformer alongside a CNN we call "transformer", or only a CNN which we call "nontransformer" n_classes = None # Number of classes. In the case of binary classification this should be 2. - n_epochs = 1 # Number of epochs. + n_epochs = 1 # Number of epochs to train. + n_batch = 1 # Number of images per batch at each iteration. (Try as large as fits on VRAM.) input_height = 224 * 1 # Height of model's input in pixels. input_width = 224 * 1 # Width of model's input in pixels. weight_decay = 1e-6 # Weight decay of l2 regularization of model layers. - n_batch = 1 # Number of batches at each iteration. learning_rate = 1e-4 # Set the learning rate. 
- patches = False # Divides input image into smaller patches (input size of the model) when set to true. For the model to see the full image, like page extraction, set this to false. - augmentation = False # To apply any kind of augmentation, this parameter must be set to true. - flip_aug = False # If true, different types of flipping will be applied to the image. Types of flips are defined with "flip_index" in config_params.json. - blur_aug = False # If true, different types of blurring will be applied to the image. Types of blur are defined with "blur_k" in config_params.json. - padding_white = False # If true, white padding will be applied to the image. - padding_black = False # If true, black padding will be applied to the image. - scaling = False # If true, scaling will be applied to the image. The amount of scaling is defined with "scales" in config_params.json. - shifting = False - degrading = False # If true, degrading will be applied to the image. The amount of degrading is defined with "degrade_scales" in config_params.json. - brightening = False # If true, brightening will be applied to the image. The amount of brightening is defined with "brightness" in config_params.json. - binarization = False # If true, Otsu thresholding will be applied to augment the input with binarized images. - adding_rgb_background = False - adding_rgb_foreground = False - add_red_textlines = False - channels_shuffling = False - dir_train = None # Directory of training dataset with subdirectories having the names "images" and "labels". - dir_eval = None # Directory of validation dataset with subdirectories having the names "images" and "labels". - dir_output = None # Directory where the output model will be saved. - pretraining = False # Set to true to load pretrained weights of ResNet50 encoder. - scaling_bluring = False # If true, a combination of scaling and blurring will be applied to the image. - scaling_binarization = False # If true, a combination of scaling and binarization will be applied to the image. - rotation = False # If true, a 90 degree rotation will be implemeneted. - rotation_not_90 = False # If true rotation based on provided angles with thetha will be implemeneted. - scaling_brightness = False # If true, a combination of scaling and brightening will be applied to the image. - scaling_flip = False # If true, a combination of scaling and flipping will be applied to the image. - thetha = None # Rotate image by these angles for augmentation. - shuffle_indexes = None - blur_k = None # Blur image for augmentation. - scales = None # Scale patches for augmentation. - degrade_scales = None # Degrade image for augmentation. - brightness = None # Brighten image for augmentation. - flip_index = None # Flip image for augmentation. - continue_training = False # Set to true if you would like to continue training an already trained a model. - transformer_patchsize_x = None # Patch size of vision transformer patches in x direction. - transformer_patchsize_y = None # Patch size of vision transformer patches in y direction. - transformer_num_patches_xy = None # Number of patches for vision transformer in x and y direction respectively. - transformer_projection_dim = 64 # Transformer projection dimension. Default value is 64. - transformer_mlp_head_units = [128, 64] # Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64] - transformer_layers = 8 # transformer layers. Default value is 8. - transformer_num_heads = 4 # Transformer number of heads. Default value is 4. 
- transformer_cnn_first = True # We have two types of vision transformers. In one type, a CNN is applied first, followed by a transformer. In the other type, this order is reversed. If transformer_cnn_first is true, it means the CNN will be applied before the transformer. Default value is true. - index_start = 0 # Index of model to continue training from. E.g. if you trained for 3 epochs and last index is 2, to continue from model_1.h5, set "index_start" to 3 to start naming model with index 3. - dir_of_start_model = '' # Directory containing pretrained encoder to continue training the model. is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false. weighted_loss = False # Use weighted categorical cross entropy as loss fucntion. When set to true, "is_loss_soft_dice" must be false. - data_is_provided = False # Only set this to true when you have already provided the input data and the train and eval data are in "dir_output". - task = "segmentation" # This parameter defines task of model which can be segmentation, enhancement or classification. f1_threshold_classification = None # This threshold is used to consider models with an evaluation f1 scores bigger than it. The selected model weights undergo a weights ensembling. And avreage ensembled model will be written to output. classification_classes_name = None # Dictionary of classification classes names. - backbone_type = None # As backbone we have 2 types of backbones. A vision transformer alongside a CNN and we call it "transformer" and only CNN called "nontransformer" - save_interval = None - dir_img_bin = None - number_of_backgrounds_per_image = 1 - dir_rgb_backgrounds = None - dir_rgb_foregrounds = None + patches = False # Divides input image into smaller patches (input size of the model) when set to true. For the model to see the full image, like page extraction, set this to false. + augmentation = False # To apply any kind of augmentation, this parameter must be set to true. + if augmentation: + flip_aug = False # If true, different types of flipping will be applied to the image. Types of flips are defined with "flip_index" in config_params.json. + if flip_aug: + flip_index = None # Flip image for augmentation. + blur_aug = False # If true, different types of blurring will be applied to the image. Types of blur are defined with "blur_k" in config_params.json. + if blur_aug: + blur_k = None # Blur image for augmentation. + padding_white = False # If true, white padding will be applied to the image. + padding_black = False # If true, black padding will be applied to the image. + scaling = False # If true, scaling will be applied to the image. The amount of scaling is defined with "scales" in config_params.json. + scaling_bluring = False # If true, a combination of scaling and blurring will be applied to the image. + scaling_binarization = False # If true, a combination of scaling and binarization will be applied to the image. + scaling_brightness = False # If true, a combination of scaling and brightening will be applied to the image. + scaling_flip = False # If true, a combination of scaling and flipping will be applied to the image. + if scaling or scaling_brightness or scaling_bluring or scaling_binarization or scaling_flip: + scales = None # Scale patches for augmentation. + shifting = False + degrading = False # If true, degrading will be applied to the image. The amount of degrading is defined with "degrade_scales" in config_params.json. 
+ if degrading: + degrade_scales = None # Degrade image for augmentation. + brightening = False # If true, brightening will be applied to the image. The amount of brightening is defined with "brightness" in config_params.json. + if brightening: + brightness = None # Brighten image for augmentation. + binarization = False # If true, Otsu thresholding will be applied to augment the input with binarized images. + if binarization: + dir_img_bin = None # Directory of training dataset subdirectory of binarized images + add_red_textlines = False + adding_rgb_background = False + if adding_rgb_background: + dir_rgb_backgrounds = None # Directory of texture images for synthetic background + adding_rgb_foreground = False + if adding_rgb_foreground: + dir_rgb_foregrounds = None # Directory of texture images for synthetic foreground + if adding_rgb_background or adding_rgb_foreground: + number_of_backgrounds_per_image = 1 + channels_shuffling = False # Re-arrange color channels. + if channels_shuffling: + shuffle_indexes = None # Which channels to switch between. + rotation = False # If true, a 90 degree rotation will be implemeneted. + rotation_not_90 = False # If true rotation based on provided angles with thetha will be implemeneted. + if rotation_not_90: + thetha = None # Rotate image by these angles for augmentation. + dir_train = None # Directory of training dataset with subdirectories having the names "images" and "labels". + dir_eval = None # Directory of validation dataset with subdirectories having the names "images" and "labels". + dir_output = None # Directory where the augmented training data and the model checkpoints will be saved. + pretraining = False # Set to true to (down)load pretrained weights of ResNet50 encoder. + save_interval = None # frequency for writing model checkpoints (nonzero integer for number of batches, or zero for epoch) + continue_training = False # Set to true if you would like to continue training an already trained a model. + dir_of_start_model = '' # Directory containing pretrained encoder to continue training the model. + data_is_provided = False # Only set this to true when you have already provided the input data and the train and eval data are in "dir_output". + if backbone_type == "transformer": + transformer_patchsize_x = None # Patch size of vision transformer patches in x direction. + transformer_patchsize_y = None # Patch size of vision transformer patches in y direction. + transformer_num_patches_xy = None # Number of patches for vision transformer in x and y direction respectively. + transformer_projection_dim = 64 # Transformer projection dimension. Default value is 64. + transformer_mlp_head_units = [128, 64] # Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64] + transformer_layers = 8 # transformer layers. Default value is 8. + transformer_num_heads = 4 # Transformer number of heads. Default value is 4. + transformer_cnn_first = True # We have two types of vision transformers: either the CNN is applied first, followed by the transformer, or reversed. 
@ex.automain -def run(_config, n_classes, n_epochs, input_height, - input_width, weight_decay, weighted_loss, - index_start, dir_of_start_model, is_loss_soft_dice, - n_batch, patches, augmentation, flip_aug, - blur_aug, padding_white, padding_black, scaling, shifting, degrading,channels_shuffling, - brightening, binarization, adding_rgb_background, adding_rgb_foreground, add_red_textlines, blur_k, scales, degrade_scales,shuffle_indexes, - brightness, dir_train, data_is_provided, scaling_bluring, - scaling_brightness, scaling_binarization, rotation, rotation_not_90, - thetha, scaling_flip, continue_training, transformer_projection_dim, - transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_cnn_first, - transformer_patchsize_x, transformer_patchsize_y, - transformer_num_patches_xy, backbone_type, save_interval, flip_index, dir_eval, dir_output, - pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name, dir_img_bin, number_of_backgrounds_per_image,dir_rgb_backgrounds, dir_rgb_foregrounds): +def run(_config, + _log, + task, + pretraining, + data_is_provided, + dir_train, + dir_eval, + dir_output, + n_classes, + n_epochs, + n_batch, + input_height, + input_width, + is_loss_soft_dice, + weighted_loss, + weight_decay, + learning_rate, + continue_training, + dir_of_start_model, + save_interval, + augmentation, + thetha, + backbone_type, + transformer_projection_dim, + transformer_mlp_head_units, + transformer_layers, + transformer_num_heads, + transformer_cnn_first, + transformer_patchsize_x, + transformer_patchsize_y, + transformer_num_patches_xy, + f1_threshold_classification, + classification_classes_name, +): if pretraining and not os.path.isfile(RESNET50_WEIGHTS_PATH): - print("downloading RESNET50 pretrained weights to", RESNET50_WEIGHTS_PATH) + _log.info("downloading RESNET50 pretrained weights to %s", RESNET50_WEIGHTS_PATH) download_file(RESNET50_WEIGHTS_URL, RESNET50_WEIGHTS_PATH) - - if dir_rgb_backgrounds: - list_all_possible_background_images = os.listdir(dir_rgb_backgrounds) - else: - list_all_possible_background_images = None - - if dir_rgb_foregrounds: - list_all_possible_foreground_rgbs = os.listdir(dir_rgb_foregrounds) - else: - list_all_possible_foreground_rgbs = None - + + # set the gpu configuration + configuration() + if task in ["segmentation", "enhancement", "binarization"]: - if data_is_provided: - dir_train_flowing = os.path.join(dir_output, 'train') - dir_eval_flowing = os.path.join(dir_output, 'eval') - - dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images') - dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels') + dir_train_flowing = os.path.join(dir_output, 'train') + dir_eval_flowing = os.path.join(dir_output, 'eval') - dir_flow_eval_imgs = os.path.join(dir_eval_flowing, 'images') - dir_flow_eval_labels = os.path.join(dir_eval_flowing, 'labels') + dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images') + dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels') - configuration() - - else: - dir_img, dir_seg = get_dirs_or_files(dir_train) - dir_img_val, dir_seg_val = get_dirs_or_files(dir_eval) - - # make first a directory in output for both training and evaluations in order to flow data from these directories. 
- dir_train_flowing = os.path.join(dir_output, 'train') - dir_eval_flowing = os.path.join(dir_output, 'eval') - - dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images/') - dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels/') - - dir_flow_eval_imgs = os.path.join(dir_eval_flowing, 'images/') - dir_flow_eval_labels = os.path.join(dir_eval_flowing, 'labels/') + dir_flow_eval_imgs = os.path.join(dir_eval_flowing, 'images') + dir_flow_eval_labels = os.path.join(dir_eval_flowing, 'labels') + if not data_is_provided: + # first create a directory in output for both training and evaluations + # in order to flow data from these directories. if os.path.isdir(dir_train_flowing): os.system('rm -rf ' + dir_train_flowing) - os.makedirs(dir_train_flowing) - else: - os.makedirs(dir_train_flowing) + os.makedirs(dir_train_flowing) if os.path.isdir(dir_eval_flowing): os.system('rm -rf ' + dir_eval_flowing) - os.makedirs(dir_eval_flowing) - else: - os.makedirs(dir_eval_flowing) + os.makedirs(dir_eval_flowing) os.mkdir(dir_flow_train_imgs) os.mkdir(dir_flow_train_labels) From 29a0f19cee579665d5edfaa8b3d2bbc8e3bb31b0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Jan 2026 13:53:11 +0100 Subject: [PATCH 56/91] =?UTF-8?q?training:=20simplify=20image=20preprocess?= =?UTF-8?q?ing=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `utils.provide_patches`: split up loop into * `utils.preprocess_img` (single img function) * `utils.preprocess_imgs` (top-level loop) - capture exceptions for all cases (not just some) at top level and with informative logging - avoid repeating / delegating config keys in several places: only as kwargs to `preprocess_img()` - read files into memory only once, then re-use - improve readability (avoiding long lines, repeated code) --- src/eynollah/training/train.py | 81 ++-- src/eynollah/training/utils.py | 799 ++++++++++++++++++++------------- 2 files changed, 510 insertions(+), 370 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index e93281a..9c638ea 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -26,7 +26,7 @@ from eynollah.training.utils import ( generate_data_from_folder_evaluation, generate_data_from_folder_training, get_one_hot, - provide_patches, + preprocess_imgs, return_number_of_total_training_data ) @@ -240,9 +240,9 @@ def run(_config, os.mkdir(dir_flow_eval_imgs) os.mkdir(dir_flow_eval_labels) - # set the gpu configuration - configuration() - + dir_img, dir_seg = get_dirs_or_files(dir_train) + dir_img_val, dir_seg_val = get_dirs_or_files(dir_eval) + imgs_list=np.array(os.listdir(dir_img)) segs_list=np.array(os.listdir(dir_seg)) @@ -250,50 +250,21 @@ def run(_config, segs_list_test=np.array(os.listdir(dir_seg_val)) # writing patches into a sub-folder in order to be flowed from directory. 
- common_args = [input_height, input_width, - blur_k, blur_aug, - padding_white, padding_black, - flip_aug, binarization, - adding_rgb_background, - adding_rgb_foreground, - add_red_textlines, - channels_shuffling, - scaling, shifting, degrading, brightening, - scales, degrade_scales, brightness, - flip_index, shuffle_indexes, - scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, - scaling_flip, task, - ] - common_kwargs = dict(patches= - patches, - dir_img_bin= - dir_img_bin, - number_of_backgrounds_per_image= - number_of_backgrounds_per_image, - list_all_possible_background_images= - list_all_possible_background_images, - dir_rgb_backgrounds= - dir_rgb_backgrounds, - dir_rgb_foregrounds= - dir_rgb_foregrounds, - list_all_possible_foreground_rgbs= - list_all_possible_foreground_rgbs, - ) - provide_patches(imgs_list, segs_list, - dir_img, dir_seg, + preprocess_imgs(_config, + imgs_list, + segs_list, + dir_img, + dir_seg, dir_flow_train_imgs, - dir_flow_train_labels, - *common_args, - augmentation=augmentation, - **common_kwargs) - provide_patches(imgs_list_test, segs_list_test, - dir_img_val, dir_seg_val, + dir_flow_train_labels) + preprocess_imgs(_config, + imgs_list_test, + segs_list_test, + dir_img_val, + dir_seg_val, dir_flow_eval_imgs, dir_flow_eval_labels, - *common_args, - augmentation=False, - **common_kwargs) + augmentation=False) if weighted_loss: weights = np.zeros(n_classes) @@ -307,8 +278,8 @@ def run(_config, label_obj = cv2.imread(label_file) label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) weights += (label_obj_one_hot.sum(axis=0)).sum(axis=0) - except Exception as e: - print("error reading data file '%s': %s" % (label_file, e), file=sys.stderr) + except Exception: + _log.exception("error reading data file '%s'", label_file) weights = 1.00 / weights weights = weights / float(np.sum(weights)) @@ -340,7 +311,6 @@ def run(_config, custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) else: - index_start = 0 if backbone_type == 'nontransformer': model = resnet50_unet(n_classes, input_height, @@ -391,7 +361,7 @@ def run(_config, pretraining) #if you want to see the model structure just uncomment model summary. 
- model.summary() + #model.summary() if task in ["segmentation", "binarization"]: if is_loss_soft_dice: @@ -423,7 +393,12 @@ def run(_config, SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) - + + _log.info("training on %d batches in %d epochs", + len(os.listdir(dir_flow_train_imgs)) // n_batch - 1, + n_epochs) + _log.info("validating on %d batches", + len(os.listdir(dir_flow_eval_imgs)) // n_batch - 1) model.fit( train_gen, steps_per_epoch=len(os.listdir(dir_flow_train_imgs)) // n_batch - 1, @@ -439,7 +414,6 @@ def run(_config, #model.save(dir_output+'/'+'model'+'.h5') elif task=='classification': - configuration() model = resnet50_classifier(n_classes, input_height, input_width, @@ -474,7 +448,7 @@ def run(_config, usable_checkpoints = np.flatnonzero(np.array(history['val_f1']) > f1_threshold_classification) if len(usable_checkpoints) >= 1: - print("averaging over usable checkpoints", usable_checkpoints) + _log.info("averaging over usable checkpoints: %s", str(usable_checkpoints)) all_weights = [] for epoch in usable_checkpoints: cp_path = os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch)) @@ -495,10 +469,9 @@ def run(_config, model.save(cp_path) with open(os.path.join(cp_path, "config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON - print("ensemble model saved under", cp_path) + _log.info("ensemble model saved under '%s'", cp_path) elif task=='reading_order': - configuration() model = machine_based_reading_order_model( n_classes, input_height, input_width, weight_decay, pretraining) diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index 1278be5..61b2536 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -1,6 +1,7 @@ import os import math import random +from logging import getLogger import cv2 import numpy as np @@ -266,8 +267,9 @@ def generate_data_from_folder_training(path_classes, batchsize, height, width, n ret_y= np.zeros((batchsize, n_classes)).astype(np.int16) batchcount = 0 -def do_brightening(img_in_dir, factor): - im = Image.open(img_in_dir) +def do_brightening(img, factor): + img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + im = Image.fromarray(img_rgb) enhancer = ImageEnhance.Brightness(im) out_img = enhancer.enhance(factor) out_img = out_img.convert('RGB') @@ -737,321 +739,486 @@ def get_patches_num_scale_new(dir_img_f, dir_seg_f, img, label, height, width, i return indexer -def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow_train_imgs, - dir_flow_train_labels, input_height, input_width, blur_k, blur_aug, - padding_white, padding_black, flip_aug, binarization, adding_rgb_background, adding_rgb_foreground, add_red_textlines, channels_shuffling, scaling, shifting, degrading, - brightening, scales, degrade_scales, brightness, flip_index, shuffle_indexes, - scaling_bluring, scaling_brightness, scaling_binarization, rotation, - rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=False, dir_img_bin=None,number_of_backgrounds_per_image=None,list_all_possible_background_images=None, dir_rgb_backgrounds=None, dir_rgb_foregrounds=None, list_all_possible_foreground_rgbs=None): - +def preprocess_imgs(config, + imgs_list, + segs_list, + dir_img, + dir_seg, + dir_flow_imgs, + dir_flow_labels, + logger=None, + **kwargs, +): + if logger is None: + logger = getLogger('') + + # make a copy for this run + config = dict(config) + # add derived keys not part of 
config + if config.get('dir_rgb_backgrounds', None): + config['list_all_possible_background_images'] = \ + os.listdir(config['dir_rgb_backgrounds']) + if config.get('dir_rgb_foregrounds', None): + config['list_all_possible_foreground_rgbs'] = \ + os.listdir(config['dir_rgb_foregrounds']) + # override keys from call + config.update(kwargs) + indexer = 0 - for im, seg_i in tqdm(zip(imgs_list_train, segs_list_train)): + for im, seg_i in tqdm(zip(imgs_list, segs_list)): + img = cv2.imread(os.path.join(dir_img, im)) img_name = os.path.splitext(im)[0] - if task == "segmentation" or task == "binarization": - dir_of_label_file = os.path.join(dir_seg, img_name + '.png') - elif task=="enhancement": - dir_of_label_file = os.path.join(dir_seg, im) - - if not patches: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_img + '/' + im), input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - - if augmentation: - if flip_aug: - for f_i in flip_index: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - resize_image(cv2.flip(cv2.imread(dir_img+'/'+im),f_i),input_height,input_width) ) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.flip(cv2.imread(dir_of_label_file), f_i), input_height, input_width)) - indexer += 1 - - if blur_aug: - for blur_i in blur_k: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - (resize_image(bluring(cv2.imread(dir_img + '/' + im), blur_i), input_height, input_width))) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - if brightening: - for factor in brightness: - try: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - (resize_image(do_brightening(dir_img + '/' +im, factor), input_height, input_width))) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - except: - pass - - if binarization: - - if dir_img_bin: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - resize_image(img_bin_corr, input_height, input_width)) - else: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - resize_image(otsu_copy(cv2.imread(dir_img + '/' + im)), input_height, input_width)) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - - if degrading: - for degrade_scale_ind in degrade_scales: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - (resize_image(do_degrading(cv2.imread(dir_img + '/' + im), degrade_scale_ind), input_height, input_width))) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - - if rotation_not_90: - for thetha_i in thetha: - img_max_rotated, label_max_rotated = rotation_not_90_func(cv2.imread(dir_img + '/'+im), - cv2.imread(dir_of_label_file), thetha_i) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_max_rotated, input_height, input_width)) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + 
'.png', resize_image(label_max_rotated, input_height, input_width)) - indexer += 1 - - if channels_shuffling: - for shuffle_index in shuffle_indexes: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - (resize_image(return_shuffled_channels(cv2.imread(dir_img + '/' + im), shuffle_index), input_height, input_width))) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - - if scaling: - for sc_ind in scales: - img_scaled, label_scaled = scale_image_for_no_patch(cv2.imread(dir_img + '/'+im), - cv2.imread(dir_of_label_file), sc_ind) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_scaled, input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(label_scaled, input_height, input_width)) - indexer += 1 - if shifting: - shift_types = ['xpos', 'xmin', 'ypos', 'ymin', 'xypos', 'xymin'] - for st_ind in shift_types: - img_shifted, label_shifted = shift_image_and_label(cv2.imread(dir_img + '/'+im), - cv2.imread(dir_of_label_file), st_ind) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_shifted, input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(label_shifted, input_height, input_width)) - indexer += 1 - - - if adding_rgb_background: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - for i_n in range(number_of_backgrounds_per_image): - background_image_chosen_name = random.choice(list_all_possible_background_images) - img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) - img_with_overlayed_background = return_binary_image_with_given_rgb_background(img_bin_corr, img_rgb_background_chosen) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_with_overlayed_background, input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - - indexer += 1 - - if adding_rgb_foreground: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - for i_n in range(number_of_backgrounds_per_image): - background_image_chosen_name = random.choice(list_all_possible_background_images) - foreground_rgb_chosen_name = random.choice(list_all_possible_foreground_rgbs) - - img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) - foreground_rgb_chosen = np.load(dir_rgb_foregrounds + '/' + foreground_rgb_chosen_name) - - img_with_overlayed_background = return_binary_image_with_given_rgb_background_and_given_foreground_rgb(img_bin_corr, img_rgb_background_chosen, foreground_rgb_chosen) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_with_overlayed_background, input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - - indexer += 1 - - if add_red_textlines: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - img_red_context = return_image_with_red_elements(cv2.imread(dir_img + '/'+im), img_bin_corr) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_red_context, input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', 
- resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - - indexer += 1 - - - - - if patches: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - cv2.imread(dir_img + '/' + im), cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - if augmentation: - if rotation: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - rotation_90(cv2.imread(dir_img + '/' + im)), - rotation_90(cv2.imread(dir_of_label_file)), - input_height, input_width, indexer=indexer) - - if rotation_not_90: - for thetha_i in thetha: - img_max_rotated, label_max_rotated = rotation_not_90_func(cv2.imread(dir_img + '/'+im), - cv2.imread(dir_of_label_file), thetha_i) - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - img_max_rotated, - label_max_rotated, - input_height, input_width, indexer=indexer) - - if channels_shuffling: - for shuffle_index in shuffle_indexes: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - return_shuffled_channels(cv2.imread(dir_img + '/' + im), shuffle_index), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - if adding_rgb_background: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - for i_n in range(number_of_backgrounds_per_image): - background_image_chosen_name = random.choice(list_all_possible_background_images) - img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) - img_with_overlayed_background = return_binary_image_with_given_rgb_background(img_bin_corr, img_rgb_background_chosen) - - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - img_with_overlayed_background, - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - - if adding_rgb_foreground: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - for i_n in range(number_of_backgrounds_per_image): - background_image_chosen_name = random.choice(list_all_possible_background_images) - foreground_rgb_chosen_name = random.choice(list_all_possible_foreground_rgbs) - - img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) - foreground_rgb_chosen = np.load(dir_rgb_foregrounds + '/' + foreground_rgb_chosen_name) - - img_with_overlayed_background = return_binary_image_with_given_rgb_background_and_given_foreground_rgb(img_bin_corr, img_rgb_background_chosen, foreground_rgb_chosen) - - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - img_with_overlayed_background, - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - - if add_red_textlines: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - img_red_context = return_image_with_red_elements(cv2.imread(dir_img + '/'+im), img_bin_corr) - - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - img_red_context, - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - if flip_aug: - for f_i in flip_index: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - cv2.flip(cv2.imread(dir_img + '/' + im), f_i), - cv2.flip(cv2.imread(dir_of_label_file), f_i), - input_height, input_width, indexer=indexer) - if blur_aug: - for blur_i in blur_k: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - bluring(cv2.imread(dir_img + '/' + im), blur_i), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - if padding_black: - indexer = 
get_patches(dir_flow_train_imgs, dir_flow_train_labels, - do_padding_black(cv2.imread(dir_img + '/' + im)), - do_padding_label(cv2.imread(dir_of_label_file)), - input_height, input_width, indexer=indexer) - - if padding_white: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - do_padding_white(cv2.imread(dir_img + '/'+im)), - do_padding_label(cv2.imread(dir_of_label_file)), - input_height, input_width, indexer=indexer) - - if brightening: - for factor in brightness: - try: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - do_brightening(dir_img + '/' +im, factor), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - except: - pass - if scaling: - for sc_ind in scales: - indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, - cv2.imread(dir_img + '/' + im) , - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer, scaler=sc_ind) - - if degrading: - for degrade_scale_ind in degrade_scales: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - do_degrading(cv2.imread(dir_img + '/' + im), degrade_scale_ind), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - if binarization: - if dir_img_bin: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - img_bin_corr, - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - else: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - otsu_copy(cv2.imread(dir_img + '/' + im)), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) + if config['task'] in ["segmentation", "binarization"]: + lab = cv2.imread(os.path.join(dir_seg, img_name + '.png')) + elif config['task'] == "enhancement": + lab = cv2.imread(os.path.join(dir_seg, im)) + else: + lab = None - if scaling_brightness: - for sc_ind in scales: - for factor in brightness: - try: - indexer = get_patches_num_scale_new(dir_flow_train_imgs, - dir_flow_train_labels, - do_brightening(dir_img + '/' + im, factor) - ,cv2.imread(dir_of_label_file) - ,input_height, input_width, indexer=indexer, scaler=sc_ind) - except: - pass - - if scaling_bluring: - for sc_ind in scales: - for blur_i in blur_k: - indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, - bluring(cv2.imread(dir_img + '/' + im), blur_i), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer, scaler=sc_ind) + try: + indexer = preprocess_img(indexer, img, img_name, lab, + dir_flow_imgs, + dir_flow_labels, + **config) - if scaling_binarization: - for sc_ind in scales: - indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, - otsu_copy(cv2.imread(dir_img + '/' + im)), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer, scaler=sc_ind) - - if scaling_flip: - for sc_ind in scales: - for f_i in flip_index: - indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, - cv2.flip( cv2.imread(dir_img + '/' + im), f_i), - cv2.flip(cv2.imread(dir_of_label_file), f_i), - input_height, input_width, indexer=indexer, scaler=sc_ind) + except: + logger.exception("skipping image %s", img_name) + +def preprocess_img(indexer, + img, + img_name, + lab, + dir_flow_train_imgs, + dir_flow_train_labels, + input_height=None, + input_width=None, + augmentation=False, + flip_aug=False, + flip_index=None, + blur_aug=False, + blur_k=None, + 
padding_white=False, + padding_black=False, + scaling=False, + scaling_bluring=False, + scaling_brightness=False, + scaling_binarization=False, + scaling_flip=False, + scales=None, + shifting=False, + degrading=False, + degrade_scales=None, + brightening=False, + brightness=None, + binarization=False, + dir_img_bin=None, + add_red_textlines=False, + adding_rgb_background=False, + dir_rgb_backgrounds=None, + adding_rgb_foreground=False, + dir_rgb_foregrounds=None, + number_of_backgrounds_per_image=None, + channels_shuffling=False, + shuffle_indexes=None, + rotation=False, + rotation_not_90=False, + thetha=None, + patches=False, + list_all_possible_background_images=None, + list_all_possible_foreground_rgbs=None, + **kwargs, +): + if not patches: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if augmentation: + if flip_aug: + for f_i in flip_index: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(cv2.flip(img, f_i), + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(cv2.flip(lab, f_i), + input_height, + input_width)) + indexer += 1 + if blur_aug: + for blur_i in blur_k: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + (resize_image(bluring(img, blur_i), + input_height, + input_width))) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if brightening: + for factor in brightness: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + (resize_image(do_brightening(img, factor), + input_height, + input_width))) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if binarization: + if dir_img_bin: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_bin_corr, + input_height, + input_width)) + else: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(otsu_copy(img), + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if degrading: + for degrade_scale_ind in degrade_scales: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + (resize_image(do_degrading(img, degrade_scale_ind), + input_height, + input_width))) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if rotation_not_90: + for thetha_i in thetha: + img_max_rotated, label_max_rotated = \ + rotation_not_90_func(img, lab, thetha_i) + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_max_rotated, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(label_max_rotated, + input_height, + input_width)) + indexer += 1 + if channels_shuffling: + for shuffle_index in shuffle_indexes: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + (resize_image(return_shuffled_channels(img, shuffle_index), + input_height, + input_width))) + cv2.imwrite(dir_flow_train_labels 
+ '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if scaling: + for sc_ind in scales: + img_scaled, label_scaled = \ + scale_image_for_no_patch(img, lab, sc_ind) + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_scaled, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(label_scaled, + input_height, + input_width)) + indexer += 1 + if shifting: + shift_types = ['xpos', 'xmin', 'ypos', 'ymin', 'xypos', 'xymin'] + for st_ind in shift_types: + img_shifted, label_shifted = \ + shift_image_and_label(img, lab, st_ind) + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_shifted, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(label_shifted, + input_height, + input_width)) + indexer += 1 + if adding_rgb_background: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + for i_n in range(number_of_backgrounds_per_image): + background_image_chosen_name = random.choice(list_all_possible_background_images) + img_rgb_background_chosen = \ + cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) + img_with_overlayed_background = \ + return_binary_image_with_given_rgb_background( + img_bin_corr, img_rgb_background_chosen) + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_with_overlayed_background, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if adding_rgb_foreground: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + for i_n in range(number_of_backgrounds_per_image): + background_image_chosen_name = random.choice(list_all_possible_background_images) + foreground_rgb_chosen_name = random.choice(list_all_possible_foreground_rgbs) + img_rgb_background_chosen = \ + cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) + foreground_rgb_chosen = \ + np.load(dir_rgb_foregrounds + '/' + foreground_rgb_chosen_name) + img_with_overlayed_background = \ + return_binary_image_with_given_rgb_background_and_given_foreground_rgb( + img_bin_corr, img_rgb_background_chosen, foreground_rgb_chosen) + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_with_overlayed_background, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if add_red_textlines: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + img_red_context = \ + return_image_with_red_elements(img, img_bin_corr) + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_red_context, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + else: + indexer = get_patches(dir_flow_train_imgs, + dir_flow_train_labels, + img, + lab, + input_height, + input_width, + indexer=indexer) + if augmentation: + if rotation: + indexer = get_patches(dir_flow_train_imgs, + dir_flow_train_labels, + rotation_90(img), + rotation_90(lab), + input_height, + input_width, + indexer=indexer) + if rotation_not_90: + for thetha_i in thetha: + img_max_rotated, label_max_rotated = \ + rotation_not_90_func(img, 
lab, thetha_i)
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          img_max_rotated,
+                                          label_max_rotated,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if channels_shuffling:
+                for shuffle_index in shuffle_indexes:
+                    img_shuffled = \
+                        return_shuffled_channels(img, shuffle_index)
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          img_shuffled,
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if adding_rgb_background:
+                img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png')
+                for i_n in range(number_of_backgrounds_per_image):
+                    background_image_chosen_name = random.choice(list_all_possible_background_images)
+                    img_rgb_background_chosen = \
+                        cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name)
+                    img_with_overlayed_background = \
+                        return_binary_image_with_given_rgb_background(
+                            img_bin_corr, img_rgb_background_chosen)
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          img_with_overlayed_background,
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if adding_rgb_foreground:
+                img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png')
+                for i_n in range(number_of_backgrounds_per_image):
+                    background_image_chosen_name = random.choice(list_all_possible_background_images)
+                    foreground_rgb_chosen_name = random.choice(list_all_possible_foreground_rgbs)
+                    img_rgb_background_chosen = \
+                        cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name)
+                    foreground_rgb_chosen = \
+                        np.load(dir_rgb_foregrounds + '/' + foreground_rgb_chosen_name)
+                    img_with_overlayed_background = \
+                        return_binary_image_with_given_rgb_background_and_given_foreground_rgb(
+                            img_bin_corr, img_rgb_background_chosen, foreground_rgb_chosen)
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          img_with_overlayed_background,
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if add_red_textlines:
+                img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png')
+                img_red_context = \
+                    return_image_with_red_elements(img, img_bin_corr)
+                indexer = get_patches(dir_flow_train_imgs,
+                                      dir_flow_train_labels,
+                                      img_red_context,
+                                      lab,
+                                      input_height,
+                                      input_width,
+                                      indexer=indexer)
+            if flip_aug:
+                for f_i in flip_index:
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          cv2.flip(img, f_i),
+                                          cv2.flip(lab, f_i),
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if blur_aug:
+                for blur_i in blur_k:
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          bluring(img, blur_i),
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if padding_black:
+                indexer = get_patches(dir_flow_train_imgs,
+                                      dir_flow_train_labels,
+                                      do_padding_black(img),
+                                      do_padding_label(lab),
+                                      input_height,
+                                      input_width,
+                                      indexer=indexer)
+            if padding_white:
+                indexer = get_patches(dir_flow_train_imgs,
+                                      dir_flow_train_labels,
+                                      do_padding_white(img),
+                                      do_padding_label(lab),
+                                      input_height,
+                                      input_width,
+                                      indexer=indexer)
+            if brightening:
+                for factor in brightness:
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          do_brightening(img, factor),
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if scaling:
+                for sc_ind in scales:
+                    indexer = get_patches_num_scale_new(
+                        dir_flow_train_imgs,
+                        dir_flow_train_labels,
+                        img,
+                        lab,
+                        input_height,
+                        input_width,
+                        indexer=indexer,
+                        scaler=sc_ind)
+            if degrading:
+                for degrade_scale_ind in degrade_scales:
+                    img_deg = \
+                        do_degrading(img, degrade_scale_ind)
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          img_deg,
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if binarization:
+                if dir_img_bin:
+                    img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png')
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          img_bin_corr,
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+                else:
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          otsu_copy(img),
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if scaling_brightness:
+                for sc_ind in scales:
+                    for factor in brightness:
+                        img_bright = do_brightening(img, factor)
+                        indexer = get_patches_num_scale_new(
+                            dir_flow_train_imgs,
+                            dir_flow_train_labels,
+                            img_bright,
+                            lab,
+                            input_height,
+                            input_width,
+                            indexer=indexer,
+                            scaler=sc_ind)
+            if scaling_bluring:
+                for sc_ind in scales:
+                    for blur_i in blur_k:
+                        img_blur = bluring(img, blur_i)
+                        indexer = get_patches_num_scale_new(
+                            dir_flow_train_imgs,
+                            dir_flow_train_labels,
+                            img_blur,
+                            lab,
+                            input_height,
+                            input_width,
+                            indexer=indexer,
+                            scaler=sc_ind)
+            if scaling_binarization:
+                for sc_ind in scales:
+                    img_bin = otsu_copy(img)
+                    indexer = get_patches_num_scale_new(
+                        dir_flow_train_imgs,
+                        dir_flow_train_labels,
+                        img_bin,
+                        lab,
+                        input_height,
+                        input_width,
+                        indexer=indexer,
+                        scaler=sc_ind)
+            if scaling_flip:
+                for sc_ind in scales:
+                    for f_i in flip_index:
+                        indexer = get_patches_num_scale_new(
+                            dir_flow_train_imgs,
+                            dir_flow_train_labels,
+                            cv2.flip(img, f_i),
+                            cv2.flip(lab, f_i),
+                            input_height,
+                            input_width,
+                            indexer=indexer,
+                            scaler=sc_ind)
+    return indexer

From d1e8a02fd4a50d61d3101db8a9ae870201bde194 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Thu, 29 Jan 2026 03:01:14 +0100
Subject: [PATCH 57/91] training: fix epoch size calculation

---
 src/eynollah/training/train.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py
index 9c638ea..1e2ab3e 100644
--- a/src/eynollah/training/train.py
+++ b/src/eynollah/training/train.py
@@ -394,17 +394,16 @@ def run(_config,
     if save_interval:
         callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config))

-    _log.info("training on %d batches in %d epochs",
-              len(os.listdir(dir_flow_train_imgs)) // n_batch - 1,
-              n_epochs)
-    _log.info("validating on %d batches",
-              len(os.listdir(dir_flow_eval_imgs)) // n_batch - 1)
+    steps_train = len(os.listdir(dir_flow_train_imgs)) // n_batch # - 1
+    steps_val = len(os.listdir(dir_flow_eval_imgs)) // n_batch
+    _log.info("training on %d batches in %d epochs", steps_train, n_epochs)
+    _log.info("validating on %d batches", steps_val)
     model.fit(
         train_gen,
-        steps_per_epoch=len(os.listdir(dir_flow_train_imgs)) // n_batch - 1,
+        steps_per_epoch=steps_train,
         validation_data=val_gen,
         #validation_steps=1, # rs: only one batch??
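+        # e.g. with 1000 evaluation patch files and n_batch=8, steps_val is
+        # 1000 // 8 == 125; the previous "// n_batch - 1" yielded 124 and
+        # silently dropped one batch per epoch (illustrative numbers only)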
-        validation_steps=len(os.listdir(dir_flow_eval_imgs)) // n_batch - 1,
+        validation_steps=steps_val,
         epochs=n_epochs,
         callbacks=callbacks)

From 25153ad307a6ea658dee8d3be19250969530cdfc Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Thu, 29 Jan 2026 12:19:09 +0100
Subject: [PATCH 58/91] training: add IoU metric

---
 src/eynollah/training/train.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py
index 1e2ab3e..344522a 100644
--- a/src/eynollah/training/train.py
+++ b/src/eynollah/training/train.py
@@ -34,6 +34,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15
 import tensorflow as tf
 from tensorflow.keras.optimizers import SGD, Adam
+from tensorflow.keras.metrics import MeanIoU
 from tensorflow.keras.models import load_model
 from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
 from sacred import Experiment
@@ -374,7 +375,11 @@ def run(_config,
             loss = 'mean_squared_error'
         model.compile(loss=loss,
                       optimizer=Adam(learning_rate=learning_rate),
-                      metrics=['accuracy'])
+                      metrics=['accuracy', MeanIoU(n_classes,
+                                                   name='iou',
+                                                   ignore_class=0,
+                                                   sparse_y_true=False,
+                                                   sparse_y_pred=False)])

         # generating train and evaluation data
         gen_kwargs = dict(batch_size=n_batch,

From e85003db4a74d2a0b3f830c0338402368cb67d48 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 4 Feb 2026 17:32:24 +0100
Subject: [PATCH 59/91] training: re-instate `index_start`, reflect cfg dependency

- `index_start`: re-introduce cfg key, pass to Keras `Model.fit` as `initial_epoch`
- make config keys `index_start` and `dir_of_start_model` dependent on `continue_training`
- improve description

---
 src/eynollah/training/train.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py
index 344522a..de8cccd 100644
--- a/src/eynollah/training/train.py
+++ b/src/eynollah/training/train.py
@@ -157,10 +157,12 @@ def config_params():
     dir_eval = None # Directory of validation dataset with subdirectories having the names "images" and "labels".
     dir_output = None # Directory where the augmented training data and the model checkpoints will be saved.
     pretraining = False # Set to true to (down)load pretrained weights of ResNet50 encoder.
-    save_interval = None # frequency for writing model checkpoints (nonzero integer for number of batches, or zero for epoch)
-    continue_training = False # Set to true if you would like to continue training an already trained model.
-    dir_of_start_model = '' # Directory containing pretrained encoder to continue training the model.
-    data_is_provided = False # Only set this to true when you have already provided the input data and the train and eval data are in "dir_output".
+    save_interval = None # frequency for writing model checkpoints (positive integer for number of batches saved under "model_step_{batch:04d}", otherwise epoch saved under "model_{epoch:02d}")
+    continue_training = False # Whether to continue training an existing model.
+    if continue_training:
+        dir_of_start_model = '' # Directory of model checkpoint to load to continue training. (E.g. if you already trained for 3 epochs, set "dir_of_start_model=dir_output/model_03".)
+        index_start = 0 # Epoch counter initial value to continue training. (E.g. if you already trained for 3 epochs, set "index_start=3" to continue naming checkpoints model_04, model_05 etc.)
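+        # Worked example (hypothetical paths, not part of the original patch):
+        # after a first run finished 3 epochs, a continuation would set
+        #   continue_training = True
+        #   dir_of_start_model = "dir_output/model_03"
+        #   index_start = 3
+        # index_start is passed to Keras Model.fit as initial_epoch, so epoch
+        # numbering resumes at 4 and checkpoints continue at model_04.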
+ data_is_provided = False # Whether the preprocessed input data (subdirectories "images" and "labels" in both subdirectories "train" and "eval" of "dir_output") has already been generated (in the first epoch of a previous run). if backbone_type == "transformer": transformer_patchsize_x = None # Patch size of vision transformer patches in x direction. transformer_patchsize_y = None # Patch size of vision transformer patches in y direction. @@ -190,6 +192,7 @@ def run(_config, weight_decay, learning_rate, continue_training, + index_start, dir_of_start_model, save_interval, augmentation, @@ -312,6 +315,7 @@ def run(_config, custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) else: + index_start = 0 if backbone_type == 'nontransformer': model = resnet50_unet(n_classes, input_height, @@ -410,7 +414,8 @@ def run(_config, #validation_steps=1, # rs: only one batch?? validation_steps=steps_val, epochs=n_epochs, - callbacks=callbacks) + callbacks=callbacks, + initial_epoch=index_start) #os.system('rm -rf '+dir_train_flowing) #os.system('rm -rf '+dir_eval_flowing) From 1581094141a2eb8892fa58b09de7fe8500e73e08 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 4 Feb 2026 17:35:12 +0100 Subject: [PATCH 60/91] training: extend `index_start` to tasks classification and RO --- src/eynollah/training/train.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index de8cccd..168884a 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -423,11 +423,15 @@ def run(_config, #model.save(dir_output+'/'+'model'+'.h5') elif task=='classification': - model = resnet50_classifier(n_classes, - input_height, - input_width, - weight_decay, - pretraining) + if continue_training: + model = load_model(dir_of_start_model, compile=False) + else: + index_start = 0 + model = resnet50_classifier(n_classes, + input_height, + input_width, + weight_decay, + pretraining) model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), # rs: why not learning_rate? @@ -453,7 +457,8 @@ def run(_config, verbose=1, epochs=n_epochs, metrics=[F1Score(average='macro', name='f1')], - callbacks=callbacks) + callbacks=callbacks, + initial_epoch=index_start) usable_checkpoints = np.flatnonzero(np.array(history['val_f1']) > f1_threshold_classification) if len(usable_checkpoints) >= 1: @@ -481,8 +486,15 @@ def run(_config, _log.info("ensemble model saved under '%s'", cp_path) elif task=='reading_order': - model = machine_based_reading_order_model( - n_classes, input_height, input_width, weight_decay, pretraining) + if continue_training: + model = load_model(dir_of_start_model, compile=False) + else: + index_start = 0 + model = machine_based_reading_order_model(n_classes, + input_height, + input_width, + weight_decay, + pretraining) dir_flow_train_imgs = os.path.join(dir_train, 'images') dir_flow_train_labels = os.path.join(dir_train, 'labels') @@ -495,7 +507,6 @@ def run(_config, #ls_test = os.listdir(dir_flow_train_labels) #f1score_tot = [0] - indexer_start = 0 model.compile(loss="binary_crossentropy", #optimizer=SGD(learning_rate=0.01, momentum=0.9), optimizer=Adam(learning_rate=0.0001), # rs: why not learning_rate? 
@@ -515,7 +526,8 @@ def run(_config,
                           steps_per_epoch=num_rows / n_batch,
                           verbose=1,
                           epochs=n_epochs,
-                          callbacks=callbacks)
+                          callbacks=callbacks,
+                          initial_epoch=index_start)
         '''
         if f1score>f1score_tot[0]:
             f1score_tot[0] = f1score

From 7562317da5aa8f4a56c981848d23cb5eec7685d2 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 4 Feb 2026 17:35:38 +0100
Subject: [PATCH 61/91] training: fix+simplify `load_model` logic for `continue_training`

- add missing combination `transformer` (w/ patch encoder and `weighted_loss`)
- add assertion to prevent wrong loss type being configured

---
 src/eynollah/training/train.py | 36 ++++++++++++----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py
index 168884a..7ede551 100644
--- a/src/eynollah/training/train.py
+++ b/src/eynollah/training/train.py
@@ -290,30 +290,20 @@ def run(_config,
             weights = weights / float(np.min(weights))
             weights = weights / float(np.sum(weights))

+        if task == "enhancement":
+            assert not is_loss_soft_dice, "for enhancement, soft_dice loss does not apply"
+            assert not weighted_loss, "for enhancement, weighted loss does not apply"
         if continue_training:
-            if backbone_type == 'nontransformer':
-                if is_loss_soft_dice and task in ["segmentation", "binarization"]:
-                    model = load_model(dir_of_start_model, compile=True,
-                                       custom_objects={'soft_dice_loss': soft_dice_loss})
-                elif weighted_loss and task in ["segmentation", "binarization"]:
-                    model = load_model(dir_of_start_model, compile=True,
-                                       custom_objects={'loss': weighted_categorical_crossentropy(weights)})
-                else:
-                    model = load_model(dir_of_start_model , compile=True)
-
-            elif backbone_type == 'transformer':
-                if is_loss_soft_dice and task in ["segmentation", "binarization"]:
-                    model = load_model(dir_of_start_model, compile=True,
-                                       custom_objects={"PatchEncoder": PatchEncoder,
-                                                       "Patches": Patches,
-                                                       'soft_dice_loss': soft_dice_loss})
-                elif weighted_loss and task in ["segmentation", "binarization"]:
-                    model = load_model(dir_of_start_model, compile=True,
-                                       custom_objects={'loss': weighted_categorical_crossentropy(weights)})
-                else:
-                    model = load_model(dir_of_start_model, compile=True,
-                                       custom_objects = {"PatchEncoder": PatchEncoder,
-                                                         "Patches": Patches})
+            custom_objects = dict()
+            if is_loss_soft_dice:
+                custom_objects.update(soft_dice_loss=soft_dice_loss)
+            elif weighted_loss:
+                custom_objects.update(loss=weighted_categorical_crossentropy(weights))
+            if backbone_type == 'transformer':
+                custom_objects.update(PatchEncoder=PatchEncoder,
+                                      Patches=Patches)
+            model = load_model(dir_of_start_model, compile=False,
+                               custom_objects=custom_objects)
         else:
             index_start = 0
             if backbone_type == 'nontransformer':
                 model = resnet50_unet(n_classes,
                                       input_height,

From 4a65ee0c672640821ebb54dc647a3e027f21fc46 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Thu, 5 Feb 2026 11:53:19 +0100
Subject: [PATCH 62/91] =?UTF-8?q?training.train:=20more=20config=20depende?=
 =?UTF-8?q?ncies=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- make more config_params keys dependent on each other
- re-order accordingly
- in main, initialise them (as kwarg), so sacred actually
  allows overriding them by named config file

---
 src/eynollah/training/train.py | 67 ++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 31 deletions(-)

diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py
index 7ede551..a21a34d 100644
--- a/src/eynollah/training/train.py
+++ 
b/src/eynollah/training/train.py
@@ -97,7 +97,17 @@ ex = Experiment(save_git_info=False)
 @ex.config
 def config_params():
     task = "segmentation" # This parameter defines task of model which can be segmentation, enhancement or classification.
-    backbone_type = None # Type of image feature map network backbone. Either a vision transformer alongside a CNN we call "transformer", or only a CNN which we call "nontransformer"
+    if task in ["segmentation", "binarization", "enhancement"]:
+        backbone_type = "nontransformer" # Type of image feature map network backbone. Either a vision transformer alongside a CNN we call "transformer", or only a CNN which we call "nontransformer"
+        if backbone_type == "transformer":
+            transformer_patchsize_x = None # Patch size of vision transformer patches in x direction.
+            transformer_patchsize_y = None # Patch size of vision transformer patches in y direction.
+            transformer_num_patches_xy = None # Number of patches for vision transformer in x and y direction respectively.
+            transformer_projection_dim = 64 # Transformer projection dimension. Default value is 64.
+            transformer_mlp_head_units = [128, 64] # Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64]
+            transformer_layers = 8 # transformer layers. Default value is 8.
+            transformer_num_heads = 4 # Transformer number of heads. Default value is 4.
+            transformer_cnn_first = True # We have two types of vision transformers: either the CNN is applied first, followed by the transformer, or reversed.
     n_classes = None # Number of classes. In the case of binary classification this should be 2.
     n_epochs = 1 # Number of epochs to train.
     n_batch = 1 # Number of images per batch at each iteration. (Try as large as fits on VRAM.)
@@ -105,10 +115,12 @@ def config_params():
     input_width = 224 * 1 # Width of model's input in pixels.
     weight_decay = 1e-6 # Weight decay of l2 regularization of model layers.
     learning_rate = 1e-4 # Set the learning rate.
-    is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false.
-    weighted_loss = False # Use weighted categorical cross entropy as loss function. When set to true, "is_loss_soft_dice" must be false.
-    f1_threshold_classification = None # Threshold on the evaluation F1 score: checkpoints scoring above it are selected, their weights are ensembled, and the averaged ensemble model is written to output.
-    classification_classes_name = None # Dictionary of classification classes names.
+    if task in ["segmentation", "binarization"]:
+        is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false.
+        weighted_loss = False # Use weighted categorical cross entropy as loss function. When set to true, "is_loss_soft_dice" must be false.
+    elif task == "classification":
+        f1_threshold_classification = None # Threshold on the evaluation F1 score: checkpoints scoring above it are selected, their weights are ensembled, and the averaged ensemble model is written to output.
+        classification_classes_name = None # Dictionary of classification classes names.
     patches = False # Divides input image into smaller patches (input size of the model) when set to true. For the model to see the full image, like page extraction, set this to false.
     augmentation = False # To apply any kind of augmentation, this parameter must be set to true.
if augmentation: @@ -163,17 +175,8 @@ def config_params(): dir_of_start_model = '' # Directory of model checkpoint to load to continue training. (E.g. if you already trained for 3 epochs, set "dir_of_start_model=dir_output/model_03".) index_start = 0 # Epoch counter initial value to continue training. (E.g. if you already trained for 3 epochs, set "index_start=3" to continue naming checkpoints model_04, model_05 etc.) data_is_provided = False # Whether the preprocessed input data (subdirectories "images" and "labels" in both subdirectories "train" and "eval" of "dir_output") has already been generated (in the first epoch of a previous run). - if backbone_type == "transformer": - transformer_patchsize_x = None # Patch size of vision transformer patches in x direction. - transformer_patchsize_y = None # Patch size of vision transformer patches in y direction. - transformer_num_patches_xy = None # Number of patches for vision transformer in x and y direction respectively. - transformer_projection_dim = 64 # Transformer projection dimension. Default value is 64. - transformer_mlp_head_units = [128, 64] # Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64] - transformer_layers = 8 # transformer layers. Default value is 8. - transformer_num_heads = 4 # Transformer number of heads. Default value is 4. - transformer_cnn_first = True # We have two types of vision transformers: either the CNN is applied first, followed by the transformer, or reversed. -@ex.automain +@ex.main def run(_config, _log, task, @@ -187,27 +190,29 @@ def run(_config, n_batch, input_height, input_width, - is_loss_soft_dice, - weighted_loss, weight_decay, learning_rate, continue_training, - index_start, - dir_of_start_model, save_interval, augmentation, - thetha, - backbone_type, - transformer_projection_dim, - transformer_mlp_head_units, - transformer_layers, - transformer_num_heads, - transformer_cnn_first, - transformer_patchsize_x, - transformer_patchsize_y, - transformer_num_patches_xy, - f1_threshold_classification, - classification_classes_name, + # dependent config keys need a default, + # otherwise yields sacred.utils.ConfigAddedError + thetha=None, + is_loss_soft_dice=False, + weighted_loss=False, + index_start=0, + dir_of_start_model=None, + backbone_type=None, + transformer_projection_dim=None, + transformer_mlp_head_units=None, + transformer_layers=None, + transformer_num_heads=None, + transformer_cnn_first=None, + transformer_patchsize_x=None, + transformer_patchsize_y=None, + transformer_num_patches_xy=None, + f1_threshold_classification=None, + classification_classes_name=None, ): if pretraining and not os.path.isfile(RESNET50_WEIGHTS_PATH): From 5c7801a1d6273cd88b64548edf41507e5c0235d6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 11:56:11 +0100 Subject: [PATCH 63/91] training.train: simplify config args for model builder --- src/eynollah/training/models.py | 67 +++++++++++++++++++++++---------- src/eynollah/training/train.py | 33 ++++++++-------- 2 files changed, 63 insertions(+), 37 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 011c614..f053447 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -400,9 +400,21 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati return model -def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_head_units=None, transformer_layers=8, num_heads =4, projection_dim = 64, input_height=224, 
input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): - if mlp_head_units is None: - mlp_head_units = [128, 64] +def vit_resnet50_unet(num_patches, + n_classes, + transformer_patchsize_x, + transformer_patchsize_y, + transformer_mlp_head_units=None, + transformer_layers=8, + transformer_num_heads=4, + transformer_projection_dim=64, + input_height=224, + input_width=224, + task="segmentation", + weight_decay=1e-6, + pretraining=False): + if transformer_mlp_head_units is None: + transformer_mlp_head_units = [128, 64] inputs = layers.Input(shape=(input_height, input_width, 3)) #transformer_units = [ @@ -449,30 +461,30 @@ def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_he #num_patches = x.shape[1]*x.shape[2] - #patch_size_y = input_height / x.shape[1] - #patch_size_x = input_width / x.shape[2] - #patch_size = patch_size_x * patch_size_y - patches = Patches(patch_size_x, patch_size_y)(x) + patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(x) # Encode patches. - encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) for _ in range(transformer_layers): # Layer normalization 1. x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) # Create a multi-head attention layer. attention_output = layers.MultiHeadAttention( - num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + num_heads=transformer_num_heads, key_dim=transformer_projection_dim, dropout=0.1 )(x1, x1) # Skip connection 1. x2 = layers.Add()([attention_output, encoded_patches]) # Layer normalization 2. x3 = layers.LayerNormalization(epsilon=1e-6)(x2) # MLP. - x3 = mlp(x3, hidden_units=mlp_head_units, dropout_rate=0.1) + x3 = mlp(x3, hidden_units=transformer_mlp_head_units, dropout_rate=0.1) # Skip connection 2. 
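            # (editorial note) Together with the Add() just below, the lines
            # above form one standard pre-LayerNorm ViT encoder block:
            # LN -> multi-head attention -> residual add, then LN -> MLP ->
            # residual add.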
encoded_patches = layers.Add()([x3, x2]) - encoded_patches = tf.reshape(encoded_patches, [-1, x.shape[1], x.shape[2] , int( projection_dim / (patch_size_x * patch_size_y) )]) + encoded_patches = tf.reshape(encoded_patches, + [-1, x.shape[1], x.shape[2], + transformer_projection_dim // (transformer_patchsize_x * + transformer_patchsize_y)]) v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(encoded_patches) v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) @@ -524,9 +536,21 @@ def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_he return model -def vit_resnet50_unet_transformer_before_cnn(n_classes, patch_size_x, patch_size_y, num_patches, mlp_head_units=None, transformer_layers=8, num_heads =4, projection_dim = 64, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): - if mlp_head_units is None: - mlp_head_units = [128, 64] +def vit_resnet50_unet_transformer_before_cnn(num_patches, + n_classes, + transformer_patchsize_x, + transformer_patchsize_y, + transformer_mlp_head_units=None, + transformer_layers=8, + transformer_num_heads=4, + transformer_projection_dim=64, + input_height=224, + input_width=224, + task="segmentation", + weight_decay=1e-6, + pretraining=False): + if transformer_mlp_head_units is None: + transformer_mlp_head_units = [128, 64] inputs = layers.Input(shape=(input_height, input_width, 3)) ##transformer_units = [ @@ -536,27 +560,32 @@ def vit_resnet50_unet_transformer_before_cnn(n_classes, patch_size_x, patch_size IMAGE_ORDERING = 'channels_last' bn_axis=3 - patches = Patches(patch_size_x, patch_size_y)(inputs) + patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) # Encode patches. - encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) for _ in range(transformer_layers): # Layer normalization 1. x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) # Create a multi-head attention layer. attention_output = layers.MultiHeadAttention( - num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + num_heads=transformer_num_heads, key_dim=transformer_projection_dim, dropout=0.1 )(x1, x1) # Skip connection 1. x2 = layers.Add()([attention_output, encoded_patches]) # Layer normalization 2. x3 = layers.LayerNormalization(epsilon=1e-6)(x2) # MLP. - x3 = mlp(x3, hidden_units=mlp_head_units, dropout_rate=0.1) + x3 = mlp(x3, hidden_units=transformer_mlp_head_units, dropout_rate=0.1) # Skip connection 2. 
encoded_patches = layers.Add()([x3, x2]) - encoded_patches = tf.reshape(encoded_patches, [-1, input_height, input_width , int( projection_dim / (patch_size_x * patch_size_y) )]) + encoded_patches = tf.reshape(encoded_patches, + [-1, + input_height, + input_width, + transformer_projection_dim // (transformer_patchsize_x * + transformer_patchsize_y)]) encoded_patches = Conv2D(3, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay), name='convinput')(encoded_patches) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index a21a34d..4aafcf2 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -38,6 +38,7 @@ from tensorflow.keras.metrics import MeanIoU from tensorflow.keras.models import load_model from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard from sacred import Experiment +from sacred.config import create_captured_function from tqdm import tqdm from sklearn.metrics import f1_score @@ -318,7 +319,7 @@ def run(_config, task, weight_decay, pretraining) - elif backbone_type == 'transformer': + else: num_patches_x = transformer_num_patches_xy[0] num_patches_y = transformer_num_patches_xy[1] num_patches = num_patches_x * num_patches_y @@ -330,35 +331,31 @@ def run(_config, model_builder = vit_resnet50_unet_transformer_before_cnn multiple_of_32 = False - assert input_height == num_patches_y * transformer_patchsize_y * (32 if multiple_of_32 else 1), \ + assert input_height == (num_patches_y * + transformer_patchsize_y * + (32 if multiple_of_32 else 1)), \ "transformer_patchsize_y or transformer_num_patches_xy height value error: " \ "input_height should be equal to " \ "(transformer_num_patches_xy height value * transformer_patchsize_y%s)" % \ " * 32" if multiple_of_32 else "" - assert input_width == num_patches_x * transformer_patchsize_x * (32 if multiple_of_32 else 1), \ + assert input_width == (num_patches_x * + transformer_patchsize_x * + (32 if multiple_of_32 else 1)), \ "transformer_patchsize_x or transformer_num_patches_xy width value error: " \ "input_width should be equal to " \ "(transformer_num_patches_xy width value * transformer_patchsize_x%s)" % \ " * 32" if multiple_of_32 else "" - assert 0 == transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x), \ + assert 0 == (transformer_projection_dim % + (transformer_patchsize_y * + transformer_patchsize_x)), \ "transformer_projection_dim error: " \ "The remainder when parameter transformer_projection_dim is divided by " \ "(transformer_patchsize_y*transformer_patchsize_x) should be zero" - model = model_builder( - n_classes, - transformer_patchsize_x, - transformer_patchsize_y, - num_patches, - transformer_mlp_head_units, - transformer_layers, - transformer_num_heads, - transformer_projection_dim, - input_height, - input_width, - task, - weight_decay, - pretraining) + model_builder = create_captured_function(model_builder) + model_builder.config = _config + model_builder.logger = _log + model = model_builder(num_patches) #if you want to see the model structure just uncomment model summary. 
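        # (editorial note) `create_captured_function` is sacred's mechanism for
        # filling omitted arguments from a config dict: once `.config` and
        # `.logger` are attached, only `num_patches` is passed explicitly,
        # while n_classes, input_height, the transformer_* settings etc. are
        # resolved from `_config` by parameter name. A minimal sketch of the
        # mechanism, with hypothetical names (not from this repo):
        #
        #   from sacred.config import create_captured_function
        #
        #   def build(num_patches, n_classes, input_height=224):
        #       return num_patches, n_classes, input_height
        #
        #   build = create_captured_function(build)
        #   build.config = {"n_classes": 3, "input_height": 448}
        #   build(196)  # -> (196, 3, 448): both kwargs filled from config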
#model.summary() From 82d649061a7d932df25828081c01b25a6acae012 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 11:57:38 +0100 Subject: [PATCH 64/91] training.train: fix F1 metric score setup --- src/eynollah/training/train.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 4aafcf2..effc920 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -34,7 +34,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras.optimizers import SGD, Adam -from tensorflow.keras.metrics import MeanIoU +from tensorflow.keras.metrics import MeanIoU, F1Score from tensorflow.keras.models import load_model from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard from sacred import Experiment @@ -427,8 +427,8 @@ def run(_config, model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), # rs: why not learning_rate? - metrics=['accuracy']) - + metrics=['accuracy', F1Score(average='macro', name='f1')]) + list_classes = list(classification_classes_name.values()) trainXY = generate_data_from_folder_training( dir_train, n_batch, input_height, input_width, n_classes, list_classes) @@ -440,7 +440,8 @@ def run(_config, callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config, monitor='val_f1', - save_best_only=True, mode='max')] + #save_best_only=True, # we need all for ensembling + mode='max')] history = model.fit(trainXY, steps_per_epoch=num_rows / n_batch, @@ -448,17 +449,17 @@ def run(_config, validation_data=testXY, verbose=1, epochs=n_epochs, - metrics=[F1Score(average='macro', name='f1')], callbacks=callbacks, initial_epoch=index_start) - usable_checkpoints = np.flatnonzero(np.array(history['val_f1']) > f1_threshold_classification) + usable_checkpoints = np.flatnonzero(np.array(history.history['val_f1']) > + f1_threshold_classification) if len(usable_checkpoints) >= 1: _log.info("averaging over usable checkpoints: %s", str(usable_checkpoints)) all_weights = [] for epoch in usable_checkpoints: - cp_path = os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch)) - assert os.path.isdir(cp_path) + cp_path = os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch + 1)) + assert os.path.isdir(cp_path), cp_path model = load_model(cp_path, compile=False) all_weights.append(model.get_weights()) From f03124f747db7edef03d968e1b10db0e7638850d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 11:58:50 +0100 Subject: [PATCH 65/91] =?UTF-8?q?training.train:=20simplify+fix=20classifi?= =?UTF-8?q?cation=20data=20loaders=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - unify `generate_data_from_folder_training` w/ `..._evaluation` - instead of recreating array after every batch, just zero out - cast image results to uint8 instead of uint16 - cast categorical results to float instead of int --- src/eynollah/training/train.py | 15 ++++--- src/eynollah/training/utils.py | 78 ++++++++-------------------------- 2 files changed, 25 insertions(+), 68 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index effc920..0f8d0e9 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -430,13 +430,13 @@ def run(_config, 
metrics=['accuracy', F1Score(average='macro', name='f1')]) list_classes = list(classification_classes_name.values()) - trainXY = generate_data_from_folder_training( - dir_train, n_batch, input_height, input_width, n_classes, list_classes) - testXY = generate_data_from_folder_evaluation( - dir_eval, input_height, input_width, n_classes, list_classes) + trainXY = generate_data_from_folder( + dir_train, n_batch, input_height, input_width, n_classes, list_classes, shuffle=True) + testXY = generate_data_from_folder( + dir_eval, n_batch, input_height, input_width, n_classes, list_classes) + epoch_size_train = return_number_of_total_training_data(dir_train) + epoch_size_eval = return_number_of_total_training_data(dir_eval) - y_tot = np.zeros((testX.shape[0], n_classes)) - num_rows = return_number_of_total_training_data(dir_train) callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config, monitor='val_f1', @@ -444,9 +444,10 @@ def run(_config, mode='max')] history = model.fit(trainXY, - steps_per_epoch=num_rows / n_batch, + steps_per_epoch=epoch_size_train // n_batch, #class_weight=weights) validation_data=testXY, + validation_steps=epoch_size_eval // n_batch, verbose=1, epochs=n_epochs, callbacks=callbacks, diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index 61b2536..5b25a4f 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -166,50 +166,7 @@ def return_number_of_total_training_data(path_classes): -def generate_data_from_folder_evaluation(path_classes, height, width, n_classes, list_classes): - #sub_classes = os.listdir(path_classes) - #n_classes = len(sub_classes) - all_imgs = [] - labels = [] - #dicts =dict() - #indexer= 0 - for indexer, sub_c in enumerate(list_classes): - sub_files = os.listdir(os.path.join(path_classes,sub_c )) - sub_files = [os.path.join(path_classes,sub_c )+'/' + x for x in sub_files] - #print( os.listdir(os.path.join(path_classes,sub_c )) ) - all_imgs = all_imgs + sub_files - sub_labels = list( np.zeros( len(sub_files) ) +indexer ) - - #print( len(sub_labels) ) - labels = labels + sub_labels - #dicts[sub_c] = indexer - #indexer +=1 - - - categories = to_categorical(range(n_classes)).astype(np.int16)#[ [1 , 0, 0 , 0 , 0 , 0] , [0 , 1, 0 , 0 , 0 , 0] , [0 , 0, 1 , 0 , 0 , 0] , [0 , 0, 0 , 1 , 0 , 0] , [0 , 0, 0 , 0 , 1 , 0] , [0 , 0, 0 , 0 , 0 , 1] ] - ret_x= np.zeros((len(labels), height,width, 3)).astype(np.int16) - ret_y= np.zeros((len(labels), n_classes)).astype(np.int16) - - #print(all_imgs) - for i in range(len(all_imgs)): - row = all_imgs[i] - #####img = cv2.imread(row, 0) - #####img= resize_image (img, height, width) - #####img = img.astype(np.uint16) - #####ret_x[i, :,:,0] = img[:,:] - #####ret_x[i, :,:,1] = img[:,:] - #####ret_x[i, :,:,2] = img[:,:] - - img = cv2.imread(row) - img= resize_image (img, height, width) - img = img.astype(np.uint16) - ret_x[i, :,:] = img[:,:,:] - - ret_y[i, :] = categories[ int( labels[i] ) ][:] - - return ret_x/255., ret_y - -def generate_data_from_folder_training(path_classes, batchsize, height, width, n_classes, list_classes): +def generate_data_from_folder(path_classes, batchsize, height, width, n_classes, list_classes, shuffle=False): #sub_classes = os.listdir(path_classes) #n_classes = len(sub_classes) @@ -228,43 +185,42 @@ def generate_data_from_folder_training(path_classes, batchsize, height, width, n labels = labels + sub_labels #dicts[sub_c] = indexer #indexer +=1 - - ids = 
np.array(range(len(labels))) - random.shuffle(ids) - - shuffled_labels = np.array(labels)[ids] - shuffled_files = np.array(all_imgs)[ids] + + if shuffle: + ids = np.array(range(len(labels))) + random.shuffle(ids) + labels = np.array(labels)[ids] + all_imgs = np.array(all_imgs)[ids] + categories = to_categorical(range(n_classes)).astype(np.int16)#[ [1 , 0, 0 , 0 , 0 , 0] , [0 , 1, 0 , 0 , 0 , 0] , [0 , 0, 1 , 0 , 0 , 0] , [0 , 0, 0 , 1 , 0 , 0] , [0 , 0, 0 , 0 , 1 , 0] , [0 , 0, 0 , 0 , 0 , 1] ] - ret_x= np.zeros((batchsize, height,width, 3)).astype(np.int16) - ret_y= np.zeros((batchsize, n_classes)).astype(np.int16) + ret_x= np.zeros((batchsize, height,width, 3)).astype(np.uint8) + ret_y= np.zeros((batchsize, n_classes)).astype(float) batchcount = 0 while True: - for i in range(len(shuffled_files)): - row = shuffled_files[i] - #print(row) - ###img = cv2.imread(row, 0) + for lab, img in zip(labels, all_imgs): + ###img = cv2.imread(img, 0) ###img= resize_image (img, height, width) ###img = img.astype(np.uint16) ###ret_x[batchcount, :,:,0] = img[:,:] ###ret_x[batchcount, :,:,1] = img[:,:] ###ret_x[batchcount, :,:,2] = img[:,:] - img = cv2.imread(row) + img = cv2.imread(img) img= resize_image (img, height, width) img = img.astype(np.uint16) ret_x[batchcount, :,:,:] = img[:,:,:] #print(int(shuffled_labels[i]) ) #print( categories[int(shuffled_labels[i])] ) - ret_y[batchcount, :] = categories[ int( shuffled_labels[i] ) ][:] + ret_y[batchcount, :] = categories[int(lab)][:] batchcount+=1 if batchcount>=batchsize: - ret_x = ret_x/255. + ret_x = ret_x//255 yield ret_x, ret_y - ret_x= np.zeros((batchsize, height,width, 3)).astype(np.int16) - ret_y= np.zeros((batchsize, n_classes)).astype(np.int16) + ret_x[:] = 0 + ret_y[:] = 0 batchcount = 0 def do_brightening(img, factor): From 5d0c26b629dc0f7368c7d2058a2efbd0ac27a911 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 12:02:58 +0100 Subject: [PATCH 66/91] training.train: use std Keras data loader for classification (much more efficient, works with std F1 metric) --- src/eynollah/training/train.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 0f8d0e9..7cf7536 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -23,8 +23,6 @@ from eynollah.training.models import ( from eynollah.training.utils import ( data_gen, generate_arrays_from_folder_reading_order, - generate_data_from_folder_evaluation, - generate_data_from_folder_training, get_one_hot, preprocess_imgs, return_number_of_total_training_data @@ -37,6 +35,7 @@ from tensorflow.keras.optimizers import SGD, Adam from tensorflow.keras.metrics import MeanIoU, F1Score from tensorflow.keras.models import load_model from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard +from tensorflow.keras.utils import image_dataset_from_directory from sacred import Experiment from sacred.config import create_captured_function from tqdm import tqdm @@ -430,13 +429,13 @@ def run(_config, metrics=['accuracy', F1Score(average='macro', name='f1')]) list_classes = list(classification_classes_name.values()) - trainXY = generate_data_from_folder( - dir_train, n_batch, input_height, input_width, n_classes, list_classes, shuffle=True) - testXY = generate_data_from_folder( - dir_eval, n_batch, input_height, input_width, n_classes, list_classes) - epoch_size_train = return_number_of_total_training_data(dir_train) - epoch_size_eval = 
return_number_of_total_training_data(dir_eval) - + data_args = dict(label_mode="categorical", + class_names=list_classes, + batch_size=n_batch, + image_size=(input_height, input_width), + interpolation="nearest") + trainXY = image_dataset_from_directory(dir_train, shuffle=True, **data_args) + testXY = image_dataset_from_directory(dir_eval, shuffle=False, **data_args) callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config, monitor='val_f1', @@ -444,10 +443,8 @@ def run(_config, mode='max')] history = model.fit(trainXY, - steps_per_epoch=epoch_size_train // n_batch, #class_weight=weights) validation_data=testXY, - validation_steps=epoch_size_eval // n_batch, verbose=1, epochs=n_epochs, callbacks=callbacks, From b1633dfc7cf9cdfd84586b2fe367a8bd239fc2cf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 14:53:26 +0100 Subject: [PATCH 67/91] training.generate_gt: for RO, skip files if regionRefs are missing --- .../training/generate_gt_for_training.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/eynollah/training/generate_gt_for_training.py b/src/eynollah/training/generate_gt_for_training.py index 693cab8..f71614c 100644 --- a/src/eynollah/training/generate_gt_for_training.py +++ b/src/eynollah/training/generate_gt_for_training.py @@ -205,14 +205,20 @@ def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, i img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') for j in range(len(cy_main)): - img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12,int(x_min_main[j]):int(x_max_main[j]) ] = 1 + img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12, + int(x_min_main[j]):int(x_max_main[j]) ] = 1 - texts_corr_order_index = [index_tot_regions[tot_region_ref.index(i)] for i in id_all_text ] - texts_corr_order_index_int = [int(x) for x in texts_corr_order_index] - + try: + texts_corr_order_index_int = [int(index_tot_regions[tot_region_ref.index(i)]) + for i in id_all_text] + except ValueError as e: + print("incomplete ReadingOrder in", xml_file, "- skipping:", str(e)) + continue - co_text_all, texts_corr_order_index_int, regions_ar_less_than_early_min = filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, max_area, min_area, min_area_early) + co_text_all, texts_corr_order_index_int, regions_ar_less_than_early_min = \ + filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, + max_area, min_area, min_area_early) arg_array = np.array(range(len(texts_corr_order_index_int))) From 0d3a8eacba67f6fc6b8bec0fbe6ea12d4d1b948f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 14:54:08 +0100 Subject: [PATCH 68/91] improve/update docs/train.md --- docs/train.md | 110 +++++++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 46 deletions(-) diff --git a/docs/train.md b/docs/train.md index 4e76740..3c64ab9 100644 --- a/docs/train.md +++ b/docs/train.md @@ -343,51 +343,17 @@ The following parameter configuration can be applied to all segmentation use cas its sub-parameters, and continued training are defined only for segmentation use cases and enhancements, not for classification and machine-based reading order, as you can see in their example config files. -* `backbone_type`: For segmentation tasks (such as text line, binarization, and layout detection) and enhancement, we - offer two backbone options: a "nontransformer" and a "transformer" backbone. 
For the "transformer" backbone, we first - apply a CNN followed by a transformer. In contrast, the "nontransformer" backbone utilizes only a CNN ResNet-50. -* `task`: The task parameter can have values such as "segmentation", "enhancement", "classification", and "reading_order". -* `patches`: If you want to break input images into smaller patches (input size of the model) you need to set this -* parameter to `true`. In the case that the model should see the image once, like page extraction, patches should be - set to ``false``. -* `n_batch`: Number of batches at each iteration. -* `n_classes`: Number of classes. In the case of binary classification this should be 2. In the case of reading_order it - should set to 1. And for the case of layout detection just the unique number of classes should be given. -* `n_epochs`: Number of epochs. -* `input_height`: This indicates the height of model's input. -* `input_width`: This indicates the width of model's input. -* `weight_decay`: Weight decay of l2 regularization of model layers. -* `pretraining`: Set to `true` to load pretrained weights of ResNet50 encoder. The downloaded weights should be saved - in a folder named "pretrained_model" in the same directory of "train.py" script. -* `augmentation`: If you want to apply any kind of augmentation this parameter should first set to `true`. -* `flip_aug`: If `true`, different types of filp will be applied on image. Type of flips is given with "flip_index" parameter. -* `blur_aug`: If `true`, different types of blurring will be applied on image. Type of blurrings is given with "blur_k" parameter. -* `scaling`: If `true`, scaling will be applied on image. Scale of scaling is given with "scales" parameter. -* `degrading`: If `true`, degrading will be applied to the image. The amount of degrading is defined with "degrade_scales" parameter. -* `brightening`: If `true`, brightening will be applied to the image. The amount of brightening is defined with "brightness" parameter. -* `rotation_not_90`: If `true`, rotation (not 90 degree) will be applied on image. Rotation angles are given with "thetha" parameter. -* `rotation`: If `true`, 90 degree rotation will be applied on image. -* `binarization`: If `true`,Otsu thresholding will be applied to augment the input data with binarized images. -* `scaling_bluring`: If `true`, combination of scaling and blurring will be applied on image. -* `scaling_binarization`: If `true`, combination of scaling and binarization will be applied on image. -* `scaling_flip`: If `true`, combination of scaling and flip will be applied on image. -* `flip_index`: Type of flips. -* `blur_k`: Type of blurrings. -* `scales`: Scales of scaling. -* `brightness`: The amount of brightenings. -* `thetha`: Rotation angles. -* `degrade_scales`: The amount of degradings. -* `continue_training`: If `true`, it means that you have already trained a model and you would like to continue the - training. So it is needed to providethe dir of trained model with "dir_of_start_model" and index for naming - themodels. For example if you have already trained for 3 epochs then your lastindex is 2 and if you want to continue - from model_1.h5, you can set `index_start` to 3 to start naming model with index 3. -* `weighted_loss`: If `true`, this means that you want to apply weighted categorical_crossentropy as loss fucntion. Be carefull if you set to `true`the parameter "is_loss_soft_dice" should be ``false`` -* `data_is_provided`: If you have already provided the input data you can set this to `true`. 
Be sure that the train - and eval data are in"dir_output".Since when once we provide training data we resize and augmentthem and then wewrite - them in sub-directories train and eval in "dir_output". -* `dir_train`: This is the directory of "images" and "labels" (dir_train should include two subdirectories with names of images and labels ) for raw images and labels. Namely they are not prepared (not resized and not augmented) yet for training the model. When we run this tool these raw data will be transformed to suitable size needed for the model and they will be written in "dir_output" in train and eval directories. Each of train and eval include "images" and "labels" sub-directories. -* `index_start`: Starting index for saved models in the case that "continue_training" is `true`. -* `dir_of_start_model`: Directory containing pretrained model to continue training the model in the case that "continue_training" is `true`. +* `task`: The task parameter must be one of the following values: + - `binarization`, + - `enhancement`, + - `segmentation`, + - `classification`, + - `reading_order`. +* `backbone_type`: For the tasks `segmentation` (such as text line, and region layout detection), + `binarization` and `enhancement`, we offer two backbone options: + - `nontransformer` (only a CNN ResNet-50). + - `transformer` (first apply a CNN, followed by a transformer) +* `transformer_cnn_first`: Whether to apply the CNN first (followed by the transformer) when using `transformer` backbone. * `transformer_num_patches_xy`: Number of patches for vision transformer in x and y direction respectively. * `transformer_patchsize_x`: Patch size of vision transformer patches in x direction. * `transformer_patchsize_y`: Patch size of vision transformer patches in y direction. @@ -395,7 +361,59 @@ classification and machine-based reading order, as you can see in their example * `transformer_mlp_head_units`: Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64]. * `transformer_layers`: transformer layers. Default value is 8. * `transformer_num_heads`: Transformer number of heads. Default value is 4. -* `transformer_cnn_first`: We have two types of vision transformers. In one type, a CNN is applied first, followed by a transformer. In the other type, this order is reversed. If transformer_cnn_first is true, it means the CNN will be applied before the transformer. Default value is true. +* `patches`: Whether to break up (tile) input images into smaller patches (input size of the model). + If `false`, the model will see the image once (resized to the input size of the model). + Should be set to `false` for cases like page extraction. +* `n_batch`: Number of batches at each iteration. +* `n_classes`: Number of classes. In the case of binary classification this should be 2. In the case of reading_order it + should set to 1. And for the case of layout detection just the unique number of classes should be given. +* `n_epochs`: Number of epochs (iterations over the data) to train. +* `input_height`: the image height for the model's input. +* `input_width`: the image width for the model's input. +* `weight_decay`: Weight decay of l2 regularization of model layers. +* `weighted_loss`: If `true`, this means that you want to apply weighted categorical crossentropy as loss function. + (Mutually exclusive with `is_loss_soft_dice`, and only applies for `segmentation` and `binarization` tasks.) +* `pretraining`: Set to `true` to (download and) initialise pretrained weights of ResNet50 encoder. 
+* `dir_train`: Path to directory of raw training data (as extracted via `pagexml2labels`, i.e. with subdirectories + `images` and `labels` for input images and output labels. + (These are not prepared for training the model, yet. Upon first run, the raw data will be transformed to suitable size + needed for the model, and written in `dir_output` under `train` and `eval` subdirectories. See `data_is_provided`.) +* `dir_eval`: Ditto for raw evaluation data. +* `dir_output`: Directory to write model checkpoints, logs (for Tensorboard) and precomputed images to. +* `data_is_provided`: If you have already trained at least one complete epoch (using the same data settings) before, + you can set this to `true` to avoid computing the resized / patched / augmented image files again. + Be sure that there are subdirectories `train` and `eval` data are in `dir_output` (each with subdirectories `images` + and `labels`, respectively). +* `continue_training`: If `true`, continue training a model checkpoint from a previous run. + This requires providing the directory of the model checkpoint to load via `dir_of_start_model` + and setting `index_start` counter for naming new checkpoints. + For example if you have already trained for 3 epochs, then your last index is 2, so if you want + to continue with `model_04`, `model_05` etc., set `index_start=3`. +* `index_start`: Starting index for saving models in the case that `continue_training` is `true`. + (Existing checkpoints above this will be overwritten.) +* `dir_of_start_model`: Directory containing existing model checkpoint to initialise model weights from when `continue_training=true`. + (Can be an epoch-interval checkpoint, or batch-interval checkpoint from `save_interval`.) +* `augmentation`: If you want to apply any kind of augmentation this parameter should first set to `true`. + The remaining settings pertain to that... +* `flip_aug`: If `true`, different types of flipping over the image arrays. Requires `flip_index` parameter. +* `flip_index`: List of flip codes (as in `cv2.flip`, i.e. 0 for vertical, positive for horizontal shift, negative for vertical and horizontal shift). +* `blur_aug`: If `true`, different types of blurring will be applied on image. Requires `blur_k` parameter. +* `blur_k`: Method of blurring (`gauss`, `median` or `blur`). +* `scaling`: If `true`, scaling will be applied on image. Requires `scales` parameter. +* `scales`: List of scale factors for scaling. +* `scaling_bluring`: If `true`, combination of scaling and blurring will be applied on image. +* `scaling_binarization`: If `true`, combination of scaling and binarization will be applied on image. +* `scaling_flip`: If `true`, combination of scaling and flip will be applied on image. +* `degrading`: If `true`, degrading will be applied to the image. Requires `degrade_scales` parameter. +* `degrade_scales`: List of intensity factors for degrading. +* `brightening`: If `true`, brightening will be applied to the image. Requires `brightness` parameter. +* `brightness`: List of intensity factors for brightening. +* `binarization`: If `true`, Otsu thresholding will be applied to augment the input data with binarized images. +* `dir_img_bin`: With `binarization`, use this directory to read precomputed binarized images instead of ad-hoc Otsu. + (Base names should correspond to the files in `dir_train/images`.) +* `rotation`: If `true`, 90° rotation will be applied on images. +* `rotation_not_90`: If `true`, random rotation (other than 90°) will be applied on image. 
Requires `thetha` parameter. +* `thetha`: List of rotation angles (in degrees). In case of segmentation and enhancement the train and evaluation data should be organised as follows. From 6944d3161717bbe1a821ba50658fcf6aae4ba9ac Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 17:58:32 +0100 Subject: [PATCH 69/91] =?UTF-8?q?modify=20manual=20RO=20preference?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit in `return_boxes_of_images_by_order_of_reading_new`, when the next multicol separator ends in the same column, do not recurse into subspan if the next starts earlier (but continue with top span to the right first) --- src/eynollah/utils/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 4e55aef..b839385 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1881,7 +1881,10 @@ def return_boxes_of_images_by_order_of_reading_new( y_mid[nxt]]) # dbg_plt(boxes[-1], "recursive column %d:%d box [%d]" % (column, last, len(boxes))) column = last - if last == x_ending[nxt] and x_ending[nxt] <= x_ending[cur] and nxt in args: + if (last == x_ending[nxt] and + x_ending[nxt] <= x_ending[cur] and + x_starting[nxt] >= x_starting[cur] and + nxt in args): # child – recur # print("recur", nxt, y_mid[nxt], "%d:%d" % (x_starting[nxt], x_ending[nxt])) args.remove(nxt) From bd282a594d7dac9adcbcce55b09fbd1e1a7f85a9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Feb 2026 16:34:55 +0100 Subject: [PATCH 70/91] training follow-up: - use relative imports - use tf.keras everywhere (and ensure v2) - `weights_ensembling`: * use `Patches` and `PatchEncoder` from .models * drop TF1 stuff * make function / CLI more flexible (expect list of checkpoint dirs instead of single top-level directory) - train for `classification`: delegate to `weights_ensembling.run_ensembling` --- src/eynollah/eynollah_imports.py | 3 + src/eynollah/training/cli.py | 2 +- .../training/generate_gt_for_training.py | 14 +- src/eynollah/training/inference.py | 4 +- src/eynollah/training/train.py | 116 ++++++------- src/eynollah/training/weights_ensembling.py | 156 +++++------------- 6 files changed, 112 insertions(+), 183 deletions(-) diff --git a/src/eynollah/eynollah_imports.py b/src/eynollah/eynollah_imports.py index f04cfdc..496406c 100644 --- a/src/eynollah/eynollah_imports.py +++ b/src/eynollah/eynollah_imports.py @@ -1,6 +1,9 @@ """ Load libraries with possible race conditions once. This must be imported as the first module of eynollah. 
""" +import os +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 + from ocrd_utils import tf_disable_interactive_logs from torch import * tf_disable_interactive_logs() diff --git a/src/eynollah/training/cli.py b/src/eynollah/training/cli.py index 3718275..ae14f04 100644 --- a/src/eynollah/training/cli.py +++ b/src/eynollah/training/cli.py @@ -9,7 +9,7 @@ from .generate_gt_for_training import main as generate_gt_cli from .inference import main as inference_cli from .train import ex from .extract_line_gt import linegt_cli -from .weights_ensembling import main as ensemble_cli +from .weights_ensembling import ensemble_cli @click.command(context_settings=dict( ignore_unknown_options=True, diff --git a/src/eynollah/training/generate_gt_for_training.py b/src/eynollah/training/generate_gt_for_training.py index 2c076d3..2422cc2 100644 --- a/src/eynollah/training/generate_gt_for_training.py +++ b/src/eynollah/training/generate_gt_for_training.py @@ -7,7 +7,7 @@ from PIL import Image, ImageDraw, ImageFont import cv2 import numpy as np -from eynollah.training.gt_gen_utils import ( +from .gt_gen_utils import ( filter_contours_area_of_image, find_format_of_given_filename_in_dir, find_new_features_of_contours, @@ -26,6 +26,9 @@ from eynollah.training.gt_gen_utils import ( @click.group() def main(): + """ + extract GT data suitable for model training for various tasks + """ pass @main.command() @@ -74,6 +77,9 @@ def main(): ) def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, dir_out_images): + """ + extract PAGE-XML GT data suitable for model training for segmentation tasks + """ if config: with open(config) as f: config_params = json.load(f) @@ -110,6 +116,9 @@ def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, di type=click.Path(exists=True, dir_okay=False), ) def image_enhancement(dir_imgs, dir_out_images, dir_out_labels, scales): + """ + extract image GT data suitable for model training for image enhancement tasks + """ ls_imgs = os.listdir(dir_imgs) with open(scales) as f: scale_dict = json.load(f) @@ -175,6 +184,9 @@ def image_enhancement(dir_imgs, dir_out_images, dir_out_labels, scales): ) def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width, min_area_size, min_area_early): + """ + extract PAGE-XML GT data suitable for model training for reading-order task + """ xml_files_ind = os.listdir(dir_xml) xml_files_ind = [ind_xml for ind_xml in xml_files_ind if ind_xml.endswith('.xml')] input_height = int(input_height) diff --git a/src/eynollah/training/inference.py b/src/eynollah/training/inference.py index 454c689..2b26210 100644 --- a/src/eynollah/training/inference.py +++ b/src/eynollah/training/inference.py @@ -33,9 +33,9 @@ from .metrics import ( soft_dice_loss, weighted_categorical_crossentropy, ) +from.utils import scale_padd_image_for_ocr +from ..utils.utils_ocr import decode_batch_predictions -from.utils import (scale_padd_image_for_ocr) -from eynollah.utils.utils_ocr import (decode_batch_predictions) with warnings.catch_warnings(): warnings.simplefilter("ignore") diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 61dbdf7..217ab35 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -3,32 +3,8 @@ import sys import json import requests -import click -from eynollah.training.metrics import ( - soft_dice_loss, - weighted_categorical_crossentropy -) -from eynollah.training.models import ( - PatchEncoder, - 
Patches, - machine_based_reading_order_model, - resnet50_classifier, - resnet50_unet, - vit_resnet50_unet, - vit_resnet50_unet_transformer_before_cnn, - cnn_rnn_ocr_model, - RESNET50_WEIGHTS_PATH, - RESNET50_WEIGHTS_URL -) -from eynollah.training.utils import ( - data_gen, - generate_arrays_from_folder_reading_order, - get_one_hot, - preprocess_imgs, -) - -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras.optimizers import SGD, Adam @@ -43,6 +19,31 @@ from sacred.config import create_captured_function import numpy as np import cv2 +from .metrics import ( + soft_dice_loss, + weighted_categorical_crossentropy +) +from .models import ( + PatchEncoder, + Patches, + machine_based_reading_order_model, + resnet50_classifier, + resnet50_unet, + vit_resnet50_unet, + vit_resnet50_unet_transformer_before_cnn, + cnn_rnn_ocr_model, + RESNET50_WEIGHTS_PATH, + RESNET50_WEIGHTS_URL +) +from .utils import ( + data_gen, + generate_arrays_from_folder_reading_order, + get_one_hot, + preprocess_imgs, +) +from .weights_ensembling import run_ensembling + + class SaveWeightsAfterSteps(ModelCheckpoint): def __init__(self, save_interval, save_path, _config, **kwargs): if save_interval: @@ -65,9 +66,7 @@ class SaveWeightsAfterSteps(ModelCheckpoint): super()._save_handler(filepath) with open(os.path.join(filepath, "config.json"), "w") as fp: json.dump(self._config, fp) # encode dict into JSON - - - + def configuration(): try: for device in tf.config.list_physical_devices('GPU'): @@ -272,6 +271,9 @@ def run(_config, skewing_amplitudes=None, max_len=None, ): + """ + run configured experiment via sacred + """ if pretraining and not os.path.isfile(RESNET50_WEIGHTS_PATH): _log.info("downloading RESNET50 pretrained weights to %s", RESNET50_WEIGHTS_PATH) @@ -312,7 +314,7 @@ def run(_config, imgs_list = list(os.listdir(dir_img)) segs_list = list(os.listdir(dir_seg)) - + imgs_list_test = list(os.listdir(dir_img_val)) segs_list_test = list(os.listdir(dir_seg_val)) @@ -380,7 +382,7 @@ def run(_config, num_patches_x = transformer_num_patches_xy[0] num_patches_y = transformer_num_patches_xy[1] num_patches = num_patches_x * num_patches_y - + if transformer_cnn_first: model_builder = vit_resnet50_unet multiple_of_32 = True @@ -413,13 +415,13 @@ def run(_config, model_builder.config = _config model_builder.logger = _log model = model_builder(num_patches) - + assert model is not None #if you want to see the model structure just uncomment model summary. 
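        # (editorial note) Worked example of the geometry constraints enforced
        # earlier in this function (illustrative values): with
        # transformer_cnn_first the ResNet50 encoder downsamples by 32, so
        #   input_height == num_patches_y * transformer_patchsize_y * 32
        #   (e.g.  448   ==     14       *           1              * 32)
        # and likewise for the width; transformer_projection_dim must be
        # divisible by transformer_patchsize_x * transformer_patchsize_y,
        # e.g. 64 % (1 * 1) == 0.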
#model.summary() - + if task in ["segmentation", "binarization"]: - if is_loss_soft_dice: + if is_loss_soft_dice: loss = soft_dice_loss elif weighted_loss: loss = weighted_categorical_crossentropy(weights) @@ -434,7 +436,7 @@ def run(_config, ignore_class=0, sparse_y_true=False, sparse_y_pred=False)]) - + # generating train and evaluation data gen_kwargs = dict(batch_size=n_batch, input_height=input_height, @@ -447,7 +449,7 @@ def run(_config, ##img_validation_patches = os.listdir(dir_flow_eval_imgs) ##score_best=[] ##score_best.append(0) - + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: @@ -471,7 +473,7 @@ def run(_config, #os.system('rm -rf '+dir_eval_flowing) #model.save(dir_output+'/'+'model'+'.h5') - + elif task=="cnn-rnn-ocr": dir_img, dir_lab = get_dirs_or_files(dir_train) @@ -480,7 +482,7 @@ def run(_config, labs_list = list(os.listdir(dir_lab)) imgs_list_val = list(os.listdir(dir_img_val)) labs_list_val = list(os.listdir(dir_lab_val)) - + with open(characters_txt_file, 'r') as char_txt_f: characters = json.load(char_txt_f) padding_token = len(characters) + 5 @@ -533,7 +535,7 @@ def run(_config, #tf.keras.optimizers.schedules.CosineDecay(initial_learning_rate, decay_steps, alpha) opt = tf.keras.optimizers.Adam(learning_rate=learning_rate) model.compile(optimizer=opt) # rs: loss seems to be (ctc_batch_cost) in last layer - + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: @@ -544,7 +546,7 @@ def run(_config, epochs=n_epochs, callbacks=callbacks, initial_epoch=index_start) - + elif task=='classification': if continue_training: model = load_model(dir_of_start_model, compile=False) @@ -573,7 +575,7 @@ def run(_config, monitor='val_f1', #save_best_only=True, # we need all for ensembling mode='max')] - + history = model.fit(trainXY, #class_weight=weights) validation_data=testXY, @@ -586,28 +588,12 @@ def run(_config, f1_threshold_classification) if len(usable_checkpoints) >= 1: _log.info("averaging over usable checkpoints: %s", str(usable_checkpoints)) - all_weights = [] - for epoch in usable_checkpoints: - cp_path = os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch + 1)) - assert os.path.isdir(cp_path), cp_path - model = load_model(cp_path, compile=False) - all_weights.append(model.get_weights()) + usable_checkpoints = [os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch + 1)) + for epoch in usable_checkpoints] + ens_path = os.path.join(dir_output, 'model_ens_avg') + run_ensembling(usable_checkpoints, ens_path) + _log.info("ensemble model saved under '%s'", ens_path) - new_weights = [] - for layer_weights in zip(*all_weights): - layer_weights = np.array([np.array(weights).mean(axis=0) - for weights in zip(*layer_weights)]) - new_weights.append(layer_weights) - - #model = tf.keras.models.clone_model(model) - model.set_weights(new_weights) - - cp_path = os.path.join(dir_output, 'model_ens_avg') - model.save(cp_path) - with open(os.path.join(cp_path, "config.json"), "w") as fp: - json.dump(_config, fp) # encode dict into JSON - _log.info("ensemble model saved under '%s'", cp_path) - elif task=='reading_order': if continue_training: model = load_model(dir_of_start_model, compile=False) @@ -618,10 +604,10 @@ def run(_config, input_width, weight_decay, pretraining) - + dir_flow_train_imgs = os.path.join(dir_train, 'images') dir_flow_train_labels = os.path.join(dir_train, 
'labels') - + classes = os.listdir(dir_flow_train_labels) if augmentation: num_rows = len(classes)*(len(thetha) + 1) @@ -634,7 +620,7 @@ def run(_config, #optimizer=SGD(learning_rate=0.01, momentum=0.9), optimizer=Adam(learning_rate=0.0001), # rs: why not learning_rate? metrics=['accuracy']) - + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: @@ -657,5 +643,3 @@ def run(_config, model_dir = os.path.join(dir_out,'model_best') model.save(model_dir) ''' - - diff --git a/src/eynollah/training/weights_ensembling.py b/src/eynollah/training/weights_ensembling.py index 6dce7fd..01532fd 100644 --- a/src/eynollah/training/weights_ensembling.py +++ b/src/eynollah/training/weights_ensembling.py @@ -1,136 +1,66 @@ -import sys -from glob import glob -from os import environ, devnull -from os.path import join -from warnings import catch_warnings, simplefilter import os +from warnings import catch_warnings, simplefilter +import click import numpy as np -from PIL import Image -import cv2 -environ['TF_CPP_MIN_LOG_LEVEL'] = '3' -stderr = sys.stderr -sys.stderr = open(devnull, 'w') + +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +from ocrd_utils import tf_disable_interactive_logs +tf_disable_interactive_logs() import tensorflow as tf from tensorflow.keras.models import load_model -from tensorflow.python.keras import backend as tensorflow_backend -sys.stderr = stderr -from tensorflow.keras import layers -import tensorflow.keras.losses -from tensorflow.keras.layers import * -import click -import logging - -class Patches(layers.Layer): - def __init__(self, patch_size_x, patch_size_y): - super(Patches, self).__init__() - self.patch_size_x = patch_size_x - self.patch_size_y = patch_size_y - - def call(self, images): - #print(tf.shape(images)[1],'images') - #print(self.patch_size,'self.patch_size') - batch_size = tf.shape(images)[0] - patches = tf.image.extract_patches( - images=images, - sizes=[1, self.patch_size_y, self.patch_size_x, 1], - strides=[1, self.patch_size_y, self.patch_size_x, 1], - rates=[1, 1, 1, 1], - padding="VALID", - ) - #patch_dims = patches.shape[-1] - patch_dims = tf.shape(patches)[-1] - patches = tf.reshape(patches, [batch_size, -1, patch_dims]) - return patches - def get_config(self): - - config = super().get_config().copy() - config.update({ - 'patch_size_x': self.patch_size_x, - 'patch_size_y': self.patch_size_y, - }) - return config - - - -class PatchEncoder(layers.Layer): - def __init__(self, **kwargs): - super(PatchEncoder, self).__init__() - self.num_patches = num_patches - self.projection = layers.Dense(units=projection_dim) - self.position_embedding = layers.Embedding( - input_dim=num_patches, output_dim=projection_dim - ) - - def call(self, patch): - positions = tf.range(start=0, limit=self.num_patches, delta=1) - encoded = self.projection(patch) + self.position_embedding(positions) - return encoded - def get_config(self): - - config = super().get_config().copy() - config.update({ - 'num_patches': self.num_patches, - 'projection': self.projection, - 'position_embedding': self.position_embedding, - }) - return config +from .models import ( + PatchEncoder, + Patches, +) - -def start_new_session(): - ###config = tf.compat.v1.ConfigProto() - ###config.gpu_options.allow_growth = True +def run_ensembling(model_dirs, out_dir): + all_weights = [] - ###self.session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() - 
###tensorflow_backend.set_session(self.session) - - config = tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True - - session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() - tensorflow_backend.set_session(session) - return session - -def run_ensembling(dir_models, out): - ls_models = os.listdir(dir_models) - - - weights=[] - - for model_name in ls_models: - model = load_model(os.path.join(dir_models,model_name) , compile=False, custom_objects={'PatchEncoder':PatchEncoder, 'Patches': Patches}) - weights.append(model.get_weights()) + for model_dir in model_dirs: + assert os.path.isdir(model_dir), model_dir + model = load_model(model_dir, compile=False, + custom_objects=dict(PatchEncoder=PatchEncoder, + Patches=Patches)) + all_weights.append(model.get_weights()) - new_weights = list() + new_weights = [] + for layer_weights in zip(*all_weights): + layer_weights = np.array([np.array(weights).mean(axis=0) + for weights in zip(*layer_weights)]) + new_weights.append(layer_weights) - for weights_list_tuple in zip(*weights): - new_weights.append( - [np.array(weights_).mean(axis=0)\ - for weights_ in zip(*weights_list_tuple)]) - - - - new_weights = [np.array(x) for x in new_weights] - + #model = tf.keras.models.clone_model(model) model.set_weights(new_weights) - model.save(out) - os.system('cp '+os.path.join(os.path.join(dir_models,model_name) , "config.json ")+out) + + model.save(out_dir) + os.system('cp ' + os.path.join(model_dirs[0], "config.json ") + out_dir + "/") @click.command() @click.option( - "--dir_models", - "-dm", - help="directory of models", + "--in", + "-i", + help="input directory of checkpoint models to be read", + multiple=True, + required=True, type=click.Path(exists=True, file_okay=False), ) @click.option( "--out", "-o", help="output directory where ensembled model will be written.", + required=True, type=click.Path(exists=False, file_okay=False), ) +def ensemble_cli(in_, out): + """ + mix multiple model weights + + Load a sequence of models and mix them into a single ensemble model + by averaging their weights. Write the resulting model. + """ + run_ensembling(in_, out) -def main(dir_models, out): - run_ensembling(dir_models, out) - From 2492c257c6a81955915b8175344027cbd4d355d5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Feb 2026 16:52:54 +0100 Subject: [PATCH 71/91] ocrd-tool.json: re-instante light_version and textline_light dummies for backwards compatibility --- src/eynollah/ocrd-tool.json | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/src/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json index 3b500fc..fc61af7 100644 --- a/src/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -28,7 +28,19 @@ "full_layout": { "type": "boolean", "default": true, - "description": "Try to detect all element subtypes, including drop-caps and headings" + "description": "Try to detect all region subtypes, including drop-capital and heading" + }, + "light_version": { + "type": "boolean", + "default": true, + "enum": [true], + "description": "ignored (only for backwards-compatibility)" + }, + "textline_light": { + "type": "boolean", + "default": true, + "enum": [true], + "description": "ignored (only for backwards-compatibility)" }, "tables": { "type": "boolean", @@ -38,12 +50,12 @@ "curved_line": { "type": "boolean", "default": false, - "description": "try to return contour of textlines instead of just rectangle bounding box. 
Needs more processing time" + "description": "retrieve textline polygons independent of each other (needs more processing time)" }, "ignore_page_extraction": { "type": "boolean", "default": false, - "description": "if this parameter set to true, this tool would ignore page extraction" + "description": "if true, do not attempt page frame detection (cropping)" }, "allow_scaling": { "type": "boolean", @@ -58,7 +70,7 @@ "right_to_left": { "type": "boolean", "default": false, - "description": "if this parameter set to true, this tool will extract right-to-left reading order." + "description": "if true, return reading order in right-to-left reading direction." }, "headers_off": { "type": "boolean", @@ -123,13 +135,22 @@ } }, "resources": [ + { + "url": "https://zenodo.org/records/17580627/files/models_all_v0_7_0.zip?download=1", + "name": "models_layout_v0_7_0", + "type": "archive", + "size": 6119874002, + "description": "Models for layout detection, reading order detection, textline detection, page extraction, column classification, table detection, binarization, image enhancement and OCR", + "version_range": ">= v0.7.0" + }, { "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip", "name": "default", "type": "archive", "path_in_archive": "saved_model_2020_01_16", "size": 563147331, - "description": "default models provided by github.com/qurator-spk (SavedModel format)" + "description": "default models provided by github.com/qurator-spk (SavedModel format)", + "version_range": "< v0.7.0" }, { "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip", @@ -137,7 +158,8 @@ "type": "archive", "path_in_archive": ".", "size": 133230419, - "description": "updated default models provided by github.com/qurator-spk (SavedModel format)" + "description": "updated default models provided by github.com/qurator-spk (SavedModel format)", + "version_range": "< v0.7.0" } ] } From ea285124ce11aa9c00d02d2e939803a067931a61 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:06:57 +0100 Subject: [PATCH 72/91] fix Patches/PatchEncoder (make configurable again) --- src/eynollah/patch_encoder.py | 52 ++++++++++++++------------------- src/eynollah/training/models.py | 22 +++----------- 2 files changed, 26 insertions(+), 48 deletions(-) diff --git a/src/eynollah/patch_encoder.py b/src/eynollah/patch_encoder.py index dc0a291..07b843d 100644 --- a/src/eynollah/patch_encoder.py +++ b/src/eynollah/patch_encoder.py @@ -3,52 +3,44 @@ os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras import layers -projection_dim = 64 -patch_size = 1 -num_patches =21*21#14*14#28*28#14*14#28*28 - class PatchEncoder(layers.Layer): - def __init__(self): + # 441=21*21 # 14*14 # 28*28 + def __init__(self, num_patches=441, projection_dim=64): super().__init__() - self.projection = layers.Dense(units=projection_dim) - self.position_embedding = layers.Embedding(input_dim=num_patches, output_dim=projection_dim) + self.num_patches = num_patches + self.projection_dim = projection_dim + self.projection = layers.Dense(self.projection_dim) + self.position_embedding = layers.Embedding(self.num_patches, self.projection_dim) def call(self, patch): - positions = tf.range(start=0, limit=num_patches, delta=1) - encoded = self.projection(patch) + self.position_embedding(positions) - return encoded + positions = tf.range(start=0, limit=self.num_patches, delta=1) + return 
self.projection(patch) + self.position_embedding(positions) def get_config(self): - config = super().get_config().copy() - config.update({ - 'num_patches': num_patches, - 'projection': self.projection, - 'position_embedding': self.position_embedding, - }) - return config + return dict(num_patches=self.num_patches, + projection_dim=self.projection_dim, + **super().get_config()) class Patches(layers.Layer): - def __init__(self, **kwargs): - super(Patches, self).__init__() - self.patch_size = patch_size + def __init__(self, patch_size_x=1, patch_size_y=1): + super().__init__() + self.patch_size_x = patch_size_x + self.patch_size_y = patch_size_y def call(self, images): batch_size = tf.shape(images)[0] patches = tf.image.extract_patches( images=images, - sizes=[1, self.patch_size, self.patch_size, 1], - strides=[1, self.patch_size, self.patch_size, 1], + sizes=[1, self.patch_size_y, self.patch_size_x, 1], + strides=[1, self.patch_size_y, self.patch_size_x, 1], rates=[1, 1, 1, 1], padding="VALID", ) patch_dims = patches.shape[-1] - patches = tf.reshape(patches, [batch_size, -1, patch_dims]) - return patches - def get_config(self): + return tf.reshape(patches, [batch_size, -1, patch_dims]) - config = super().get_config().copy() - config.update({ - 'patch_size': self.patch_size, - }) - return config + def get_config(self): + return dict(patch_size_x=self.patch_size_x, + patch_size_y=self.patch_size_y, + **super().get_config()) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index d1148f1..b0ad51c 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -423,16 +423,9 @@ def vit_resnet50_unet(num_patches, #num_patches = x.shape[1]*x.shape[2] - # rs: fixme patch size not configurable anymore... - #patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) - patches = Patches()(x) - assert transformer_patchsize_x == transformer_patchsize_y == 1 + patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(x) # Encode patches. - # rs: fixme num patches and dim not configurable anymore... - #encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) - encoded_patches = PatchEncoder()(patches) - assert num_patches == 21 * 21 - assert transformer_projection_dim == 64 + encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) for _ in range(transformer_layers): # Layer normalization 1. @@ -530,16 +523,9 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, IMAGE_ORDERING = 'channels_last' bn_axis=3 - # rs: fixme patch size not configurable anymore... - #patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) - patches = Patches()(inputs) - assert transformer_patchsize_x == transformer_patchsize_y == 1 + patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) # Encode patches. - # rs: fixme num patches and dim not configurable anymore... - #encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) - encoded_patches = PatchEncoder()(patches) - assert num_patches == 21 * 21 - assert transformer_projection_dim == 64 + encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) for _ in range(transformer_layers): # Layer normalization 1. 
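
[editorial note] With the patch above, both layers carry their full geometry in
`get_config`, so saved checkpoints can be reloaded via `custom_objects` (as
`weights_ensembling.run_ensembling` does). A quick shape sanity check, with
hypothetical values, not part of the patch series:

    import tensorflow as tf
    from eynollah.patch_encoder import Patches, PatchEncoder

    imgs = tf.zeros([2, 56, 56, 3])
    patches = Patches(2, 2)(imgs)             # 28x28 grid -> (2, 784, 2*2*3)
    encoded = PatchEncoder(784, 64)(patches)  # -> (2, 784, 64)
    assert encoded.shape == (2, 784, 64)

Note that `num_patches` must equal (height // patch_size_y) * (width //
patch_size_x) of the layer input, otherwise the positional-embedding add in
`PatchEncoder.call` does not broadcast.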
From 53252a59c6619bbf0d164a8c7fc6c98449b208ec Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:09:40 +0100 Subject: [PATCH 73/91] training.models: fix glitch introduced in 3a73ccca --- src/eynollah/training/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index b0ad51c..5b23ecd 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -443,7 +443,6 @@ def vit_resnet50_unet(num_patches, # Skip connection 2. encoded_patches = Add()([x3, x2]) - assert isinstance(x, Layer) encoded_patches = tf.reshape(encoded_patches, [-1, x.shape[1], x.shape[2], transformer_projection_dim // (transformer_patchsize_x * From ee4bffd81d211697b608b93bf2a3986de1f4ed85 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:10:13 +0100 Subject: [PATCH 74/91] training.train: simplify transformer cfg checks --- src/eynollah/training/train.py | 37 +++++++++++++++------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 217ab35..ecf70b4 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -385,31 +385,26 @@ def run(_config, if transformer_cnn_first: model_builder = vit_resnet50_unet - multiple_of_32 = True + multiple = 32 else: model_builder = vit_resnet50_unet_transformer_before_cnn - multiple_of_32 = False + multiple = 1 - assert input_height == (num_patches_y * - transformer_patchsize_y * - (32 if multiple_of_32 else 1)), \ - "transformer_patchsize_y or transformer_num_patches_xy height value error: " \ - "input_height should be equal to " \ - "(transformer_num_patches_xy height value * transformer_patchsize_y%s)" % \ - " * 32" if multiple_of_32 else "" - assert input_width == (num_patches_x * - transformer_patchsize_x * - (32 if multiple_of_32 else 1)), \ - "transformer_patchsize_x or transformer_num_patches_xy width value error: " \ - "input_width should be equal to " \ - "(transformer_num_patches_xy width value * transformer_patchsize_x%s)" % \ - " * 32" if multiple_of_32 else "" + assert input_height == ( + num_patches_y * transformer_patchsize_y * multiple), ( + "transformer_patchsize_y or transformer_num_patches_xy height value error: " + "input_height should be equal to " + "(transformer_num_patches_xy height value * transformer_patchsize_y * %d)" % multiple) + assert input_width == ( + num_patches_x * transformer_patchsize_x * multiple), ( + "transformer_patchsize_x or transformer_num_patches_xy width value error: " + "input_width should be equal to " + "(transformer_num_patches_xy width value * transformer_patchsize_x * %d)" % multiple) assert 0 == (transformer_projection_dim % - (transformer_patchsize_y * - transformer_patchsize_x)), \ - "transformer_projection_dim error: " \ - "The remainder when parameter transformer_projection_dim is divided by " \ - "(transformer_patchsize_y*transformer_patchsize_x) should be zero" + (transformer_patchsize_y * transformer_patchsize_x)), ( + "transformer_projection_dim error: " + "The remainder when parameter transformer_projection_dim is divided by " + "(transformer_patchsize_y*transformer_patchsize_x) should be zero") model_builder = create_captured_function(model_builder) model_builder.config = _config From 7b7ef041ec397fe89c62f9bc1be843b09285f941 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:10:56 +0100 Subject: [PATCH 75/91] training.models: use asymmetric zero padding 
instead of lambda layer --- src/eynollah/training/models.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 5b23ecd..115a196 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -69,16 +69,9 @@ def mlp(x, hidden_units, dropout_rate): return x def one_side_pad(x): - # rs: fixme: lambda layers are problematic for de/serialization! - # - can we use ZeroPadding1D instead of ZeroPadding2D+Lambda? - x = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(x) - if IMAGE_ORDERING == 'channels_first': - x = Lambda(lambda x: x[:, :, :-1, :-1])(x) - elif IMAGE_ORDERING == 'channels_last': - x = Lambda(lambda x: x[:, :-1, :-1, :])(x) + x = ZeroPadding2D(((1, 0), (1, 0)), data_format=IMAGE_ORDERING)(x) return x - def identity_block(input_tensor, kernel_size, filters, stage, block): """The identity block is the block that has no conv layer at shortcut. # Arguments From 37338049af618383ca2f2c6708dd91b294b77872 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:11:44 +0100 Subject: [PATCH 76/91] training: use relative imports --- src/eynollah/training/inference.py | 2 +- src/eynollah/training/models.py | 2 +- src/eynollah/training/weights_ensembling.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/eynollah/training/inference.py b/src/eynollah/training/inference.py index 2b26210..c38b79f 100644 --- a/src/eynollah/training/inference.py +++ b/src/eynollah/training/inference.py @@ -25,7 +25,7 @@ from .gt_gen_utils import ( resize_image, update_list_and_return_first_with_length_bigger_than_one ) -from .models import ( +from ..patch_encoder import ( PatchEncoder, Patches ) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 115a196..6182c9e 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -30,7 +30,7 @@ from tensorflow.keras.layers import ( from tensorflow.keras.models import Model from tensorflow.keras.regularizers import l2 -from eynollah.patch_encoder import Patches, PatchEncoder +from ..patch_encoder import Patches, PatchEncoder ##mlp_head_units = [512, 256]#[2048, 1024] ###projection_dim = 64 diff --git a/src/eynollah/training/weights_ensembling.py b/src/eynollah/training/weights_ensembling.py index 01532fd..e3ede24 100644 --- a/src/eynollah/training/weights_ensembling.py +++ b/src/eynollah/training/weights_ensembling.py @@ -12,7 +12,7 @@ tf_disable_interactive_logs() import tensorflow as tf from tensorflow.keras.models import load_model -from .models import ( +from ..patch_encoder import ( PatchEncoder, Patches, ) From 514a897dd5392bf7a160bf02c82b04da1fc53bb0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:11:57 +0100 Subject: [PATCH 77/91] training.train: assert n_epochs vs. 
index_start --- src/eynollah/training/train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index ecf70b4..73d5e0b 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -275,6 +275,9 @@ def run(_config, run configured experiment via sacred """ + if continue_training: + assert n_epochs > index_start, "with continue_training, n_epochs must be greater than index_start" + if pretraining and not os.path.isfile(RESNET50_WEIGHTS_PATH): _log.info("downloading RESNET50 pretrained weights to %s", RESNET50_WEIGHTS_PATH) download_file(RESNET50_WEIGHTS_URL, RESNET50_WEIGHTS_PATH) From 83c2408192950f472e7c8960170cb270ba1a63af Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:25:53 +0100 Subject: [PATCH 78/91] training.utils.data_gen: avoid repeated array allocation --- src/eynollah/training/utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index 56d6bdf..a03d539 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -600,10 +600,9 @@ def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_c c = 0 n = [f for f in os.listdir(img_folder) if not f.startswith('.')] # os.listdir(img_folder) #List of training images random.shuffle(n) + img = np.zeros((batch_size, input_height, input_width, 3), dtype=float) + mask = np.zeros((batch_size, input_height, input_width, n_classes), dtype=float) while True: - img = np.zeros((batch_size, input_height, input_width, 3)).astype('float') - mask = np.zeros((batch_size, input_height, input_width, n_classes)).astype('float') - for i in range(c, c + batch_size): # initially from 0 to 16, c = 0. try: filename = os.path.splitext(n[i])[0] @@ -612,21 +611,22 @@ def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_c train_img = cv2.resize(train_img, (input_width, input_height), interpolation=cv2.INTER_NEAREST) # Read an image from folder and resize - img[i - c] = train_img # add to array - img[0], img[1], and so on. + img[i - c, :] = train_img # add to array - img[0], img[1], and so on. if task == "segmentation" or task=="binarization": train_mask = cv2.imread(mask_folder + '/' + filename + '.png') - train_mask = get_one_hot(resize_image(train_mask, input_height, input_width), input_height, input_width, - n_classes) + train_mask = resize_image(train_mask, input_height, input_width) + train_mask = get_one_hot(train_mask, input_height, input_width, n_classes) elif task == "enhancement": train_mask = cv2.imread(mask_folder + '/' + filename + '.png')/255. train_mask = resize_image(train_mask, input_height, input_width) # train_mask = train_mask.reshape(224, 224, 1) # Add extra dimension for parity with train_img size [512 * 512 * 3] - mask[i - c] = train_mask - except: - img[i - c] = np.ones((input_height, input_width, 3)).astype('float') - mask[i - c] = np.zeros((input_height, input_width, n_classes)).astype('float') + mask[i - c, :] = train_mask + except Exception as e: + print(str(e)) + img[i - c, :] = 1. + mask[i - c, :] = 0. 
c += batch_size if c + batch_size >= len(os.listdir(img_folder)): From 7888fa5968d12bf5d485705b90c805f922997d89 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 04:42:44 +0100 Subject: [PATCH 79/91] training: remove `data_gen` in favor of tf.data pipelines instead of looping over file pairs indefinitely, yielding Numpy arrays: re-use `keras.utils.image_dataset_from_directory` here as well, but with img/label generators zipped together (thus, everything will already be loaded/prefetched on the GPU) --- src/eynollah/training/train.py | 61 ++++++++++++++++++---------------- src/eynollah/training/utils.py | 38 --------------------- 2 files changed, 32 insertions(+), 67 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 73d5e0b..05a7346 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -13,6 +13,7 @@ from tensorflow.keras.models import load_model from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard from tensorflow.keras.layers import StringLookup from tensorflow.keras.utils import image_dataset_from_directory +from tensorflow.keras.backend import one_hot from sacred import Experiment from sacred.config import create_captured_function @@ -36,7 +37,6 @@ from .models import ( RESNET50_WEIGHTS_URL ) from .utils import ( - data_gen, generate_arrays_from_folder_reading_order, get_one_hot, preprocess_imgs, @@ -435,43 +435,46 @@ def run(_config, sparse_y_true=False, sparse_y_pred=False)]) - # generating train and evaluation data - gen_kwargs = dict(batch_size=n_batch, - input_height=input_height, - input_width=input_width, - n_classes=n_classes, - task=task) - train_gen = data_gen(dir_flow_train_imgs, dir_flow_train_labels, **gen_kwargs) - val_gen = data_gen(dir_flow_eval_imgs, dir_flow_eval_labels, **gen_kwargs) - - ##img_validation_patches = os.listdir(dir_flow_eval_imgs) - ##score_best=[] - ##score_best.append(0) + def get_dataset(dir_imgs, dir_labs, shuffle=None): + gen_kwargs = dict(labels=None, + label_mode=None, + batch_size=1, # batch after zip below + image_size=(input_height, input_width), + color_mode='rgb', + shuffle=shuffle is not None, + seed=shuffle, + interpolation='nearest', + crop_to_aspect_ratio=False, + # Keras 3 only... 
+ #pad_to_aspect_ratio=False, + #data_format='channel_last', + #verbose=False, + ) + img_gen = image_dataset_from_directory(dir_imgs, **gen_kwargs) + lab_gen = image_dataset_from_directory(dir_labs, **gen_kwargs) + if task in ["segmentation", "binarization"]: + @tf.function + def to_categorical(seg): + seg = tf.image.rgb_to_grayscale(seg) + seg = tf.cast(seg, tf.int8) + seg = tf.squeeze(seg, axis=-1) + return one_hot(seg, n_classes) + lab_gen = lab_gen.map(to_categorical) + return tf.data.Dataset.zip(img_gen, lab_gen).rebatch(n_batch, drop_remainder=True) + train_gen = get_dataset(dir_flow_train_imgs, dir_flow_train_labels, shuffle=np.random.randint(1e6)) + val_gen = get_dataset(dir_flow_eval_imgs, dir_flow_eval_labels) callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) - - steps_train = len(os.listdir(dir_flow_train_imgs)) // n_batch # - 1 - steps_val = len(os.listdir(dir_flow_eval_imgs)) // n_batch - _log.info("training on %d batches in %d epochs", steps_train, n_epochs) - _log.info("validating on %d batches", steps_val) model.fit( - train_gen, - steps_per_epoch=steps_train, - validation_data=val_gen, - #validation_steps=1, # rs: only one batch?? - validation_steps=steps_val, + train_gen.prefetch(tf.data.AUTOTUNE), # .repeat()?? + validation_data=val_gen.prefetch(tf.data.AUTOTUNE), epochs=n_epochs, callbacks=callbacks, initial_epoch=index_start) - #os.system('rm -rf '+dir_train_flowing) - #os.system('rm -rf '+dir_eval_flowing) - - #model.save(dir_output+'/'+'model'+'.h5') - elif task=="cnn-rnn-ocr": dir_img, dir_lab = get_dirs_or_files(dir_train) @@ -524,7 +527,7 @@ def run(_config, drop_remainder=True, #num_parallel_calls=tf.data.AUTOTUNE, ) - train_ds = train_ds.repeat().shuffle().prefetch(20) + train_ds = train_ds.prefetch(tf.data.AUTOTUNE) #initial_learning_rate = 1e-4 #decay_steps = int (n_epochs * ( len_dataset / n_batch )) diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index a03d539..f2f4bdc 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -596,44 +596,6 @@ def generate_arrays_from_folder_reading_order(classes_file_dir, modal_dir, n_bat ret_y= np.zeros((n_batch, n_classes)).astype(np.int16) batchcount = 0 -def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_classes, task='segmentation'): - c = 0 - n = [f for f in os.listdir(img_folder) if not f.startswith('.')] # os.listdir(img_folder) #List of training images - random.shuffle(n) - img = np.zeros((batch_size, input_height, input_width, 3), dtype=float) - mask = np.zeros((batch_size, input_height, input_width, n_classes), dtype=float) - while True: - for i in range(c, c + batch_size): # initially from 0 to 16, c = 0. - try: - filename = os.path.splitext(n[i])[0] - - train_img = cv2.imread(img_folder + '/' + n[i]) / 255. - train_img = cv2.resize(train_img, (input_width, input_height), - interpolation=cv2.INTER_NEAREST) # Read an image from folder and resize - - img[i - c, :] = train_img # add to array - img[0], img[1], and so on. 
- if task == "segmentation" or task=="binarization": - train_mask = cv2.imread(mask_folder + '/' + filename + '.png') - train_mask = resize_image(train_mask, input_height, input_width) - train_mask = get_one_hot(train_mask, input_height, input_width, n_classes) - elif task == "enhancement": - train_mask = cv2.imread(mask_folder + '/' + filename + '.png')/255. - train_mask = resize_image(train_mask, input_height, input_width) - - # train_mask = train_mask.reshape(224, 224, 1) # Add extra dimension for parity with train_img size [512 * 512 * 3] - - mask[i - c, :] = train_mask - except Exception as e: - print(str(e)) - img[i - c, :] = 1. - mask[i - c, :] = 0. - - c += batch_size - if c + batch_size >= len(os.listdir(img_folder)): - c = 0 - random.shuffle(n) - yield img, mask - # TODO: Use otsu_copy from utils def otsu_copy(img): From 4414f7b89b4e1488a6955bb40342709ab05c0414 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 14:18:32 +0100 Subject: [PATCH 80/91] training.models.vit_resnet50_unet: re-use `IMAGE_ORDERING` --- src/eynollah/training/models.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 6182c9e..0dc78d2 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -372,12 +372,10 @@ def vit_resnet50_unet(num_patches, transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - #transformer_units = [ - #projection_dim * 2, - #projection_dim, - #] # Size of the transformer layers - IMAGE_ORDERING = 'channels_last' - bn_axis=3 + if IMAGE_ORDERING == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(inputs) x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x) @@ -508,12 +506,10 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - ##transformer_units = [ - ##projection_dim * 2, - ##projection_dim, - ##] # Size of the transformer layers - IMAGE_ORDERING = 'channels_last' - bn_axis=3 + if IMAGE_ORDERING == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) # Encode patches. 
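Patch 81 below deduplicates the five copies of the ResNet-50 backbone into a single `resnet50()` builder returning the feature maps `f1`…`f5`. Those maps form a stride-2/4/8/16/32 pyramid, which is exactly what the U-Net decoder (factored out in patch 82) relies on when it upsamples by 2 at each skip connection. A rough sanity sketch, assuming the refactored `eynollah.training.models` module is importable as-is:

    import os
    os.environ['TF_USE_LEGACY_KERAS'] = '1'
    from tensorflow.keras import Input
    from eynollah.training.models import resnet50  # added by patch 81

    img = Input(shape=(448, 448, 3))               # any multiple of 32
    for name, f in zip(('f1', 'f2', 'f3', 'f4', 'f5'),
                       resnet50(img, weight_decay=1e-6, pretraining=False)):
        print(name, f.shape)
    # expected: (None,224,224,64) (None,112,112,256) (None,56,56,512)
    #           (None,28,28,1024) (None,14,14,2048)
    # (f2 reaches 112x112 via the asymmetric one_side_pad from patch 75)
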
From fcd10c39567376675ec77500ab12645b77cf2c68 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 14:52:04 +0100 Subject: [PATCH 81/91] training.models: re-use RESNET50 builder (+weight init) code --- src/eynollah/training/models.py | 223 +++----------------------------- 1 file changed, 21 insertions(+), 202 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 0dc78d2..406e937 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -154,19 +154,13 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)) x = Activation('relu')(x) return x - -def resnet50_unet_light(n_classes, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): - assert input_height % 32 == 0 - assert input_width % 32 == 0 - - img_input = Input(shape=(input_height, input_width, 3)) - +def resnet50(inputs, weight_decay=1e-6, pretraining=False): if IMAGE_ORDERING == 'channels_last': bn_axis = 3 else: bn_axis = 1 - x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(img_input) + x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(inputs) x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2), kernel_regularizer=l2(weight_decay), name='conv1')(x) f1 = x @@ -200,7 +194,17 @@ def resnet50_unet_light(n_classes, input_height=224, input_width=224, task="segm f5 = x if pretraining: - model = Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) + model = Model(inputs, x).load_weights(RESNET50_WEIGHTS_PATH) + + return f1, f2, f3, f4, f5 + +def resnet50_unet_light(n_classes, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): + assert input_height % 32 == 0 + assert input_width % 32 == 0 + + img_input = Input(shape=(input_height, input_width, 3)) + + f1, f2, f3, f4, f5 = resnet50(img_input, weight_decay, pretraining) v512_2048 = Conv2D(512, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f5) v512_2048 = (BatchNormalization(axis=bn_axis))(v512_2048) @@ -262,46 +266,7 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati img_input = Input(shape=(input_height, input_width, 3)) - if IMAGE_ORDERING == 'channels_last': - bn_axis = 3 - else: - bn_axis = 1 - - x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(img_input) - x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2), kernel_regularizer=l2(weight_decay), - name='conv1')(x) - f1 = x - - x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x) - x = Activation('relu')(x) - x = MaxPooling2D((3, 3), data_format=IMAGE_ORDERING, strides=(2, 2))(x) - - x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) - x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') - x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') - f2 = one_side_pad(x) - - x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') - f3 = x - - x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e') - x = identity_block(x, 3, [256, 256, 1024], 
stage=4, block='f') - f4 = x - - x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') - f5 = x - - if pretraining: - Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) + f1, f2, f3, f4, f5 = resnet50(img_input, weight_decay, pretraining) v1024_2048 = Conv2D(1024, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))( f5) @@ -372,47 +337,7 @@ def vit_resnet50_unet(num_patches, transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - if IMAGE_ORDERING == 'channels_last': - bn_axis = 3 - else: - bn_axis = 1 - - x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(inputs) - x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x) - f1 = x - - x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x) - x = Activation('relu')(x) - x = MaxPooling2D((3, 3), data_format=IMAGE_ORDERING, strides=(2, 2))(x) - - x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) - x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') - x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') - f2 = one_side_pad(x) - - x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') - f3 = x - - x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f') - f4 = x - - x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') - f5 = x - - if pretraining: - model = Model(inputs, x).load_weights(RESNET50_WEIGHTS_PATH) - - #num_patches = x.shape[1]*x.shape[2] + f1, f2, f3, f4, f5 = resnet50(inputs, weight_decay, pretraining) patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(x) # Encode patches. 
@@ -540,42 +465,9 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, encoded_patches = Conv2D(3, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay), name='convinput')(encoded_patches) - x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(encoded_patches) - x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x) - f1 = x + f1, f2, f3, f4, f5 = resnet50(encoded_patches, weight_decay, pretraining) - x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x) - x = Activation('relu')(x) - x = MaxPooling2D((3, 3), data_format=IMAGE_ORDERING, strides=(2, 2))(x) - - x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) - x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') - x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') - f2 = one_side_pad(x) - - x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') - f3 = x - - x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f') - f4 = x - - x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') - f5 = x - - if pretraining: - model = Model(encoded_patches, x).load_weights(RESNET50_WEIGHTS_PATH) - - v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(x) + v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(f5) v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) v1024_2048 = Activation('relu')(v1024_2048) @@ -633,47 +525,7 @@ def resnet50_classifier(n_classes,input_height=224,input_width=224,weight_decay= img_input = Input(shape=(input_height,input_width , 3 )) - if IMAGE_ORDERING == 'channels_last': - bn_axis = 3 - else: - bn_axis = 1 - - x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(img_input) - x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x) - f1 = x - - x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x) - x = Activation('relu')(x) - x = MaxPooling2D((3, 3) , data_format=IMAGE_ORDERING , strides=(2, 2))(x) - - - x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) - x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') - x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') - f2 = one_side_pad(x ) - - - x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') - f3 = x - - x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d') - x = 
identity_block(x, 3, [256, 256, 1024], stage=4, block='e') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f') - f4 = x - - x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') - f5 = x - - if pretraining: - Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) + _, _, _, _, x = resnet50(img_input, weight_decay, pretraining) x = AveragePooling2D((7, 7), name='avg_pool')(x) x = Flatten()(x) @@ -693,43 +545,10 @@ def machine_based_reading_order_model(n_classes,input_height=224,input_width=224 img_input = Input(shape=(input_height,input_width , 3 )) - if IMAGE_ORDERING == 'channels_last': - bn_axis = 3 - else: - bn_axis = 1 - - x1 = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(img_input) - x1 = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x1) - - x1 = BatchNormalization(axis=bn_axis, name='bn_conv1')(x1) - x1 = Activation('relu')(x1) - x1 = MaxPooling2D((3, 3) , data_format=IMAGE_ORDERING , strides=(2, 2))(x1) + _, _, _, _, x = resnet50(img_input, weight_decay, pretraining) - x1 = conv_block(x1, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) - x1 = identity_block(x1, 3, [64, 64, 256], stage=2, block='b') - x1 = identity_block(x1, 3, [64, 64, 256], stage=2, block='c') - - x1 = conv_block(x1, 3, [128, 128, 512], stage=3, block='a') - x1 = identity_block(x1, 3, [128, 128, 512], stage=3, block='b') - x1 = identity_block(x1, 3, [128, 128, 512], stage=3, block='c') - x1 = identity_block(x1, 3, [128, 128, 512], stage=3, block='d') - - x1 = conv_block(x1, 3, [256, 256, 1024], stage=4, block='a') - x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='b') - x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='c') - x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='d') - x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='e') - x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='f') - - x1 = conv_block(x1, 3, [512, 512, 2048], stage=5, block='a') - x1 = identity_block(x1, 3, [512, 512, 2048], stage=5, block='b') - x1 = identity_block(x1, 3, [512, 512, 2048], stage=5, block='c') - - if pretraining: - Model(img_input , x1).load_weights(RESNET50_WEIGHTS_PATH) - - x1 = AveragePooling2D((7, 7), name='avg_pool1')(x1) - flattened = Flatten()(x1) + x = AveragePooling2D((7, 7), name='avg_pool1')(x) + flattened = Flatten()(x) o = Dense(256, activation='relu', name='fc512')(flattened) o=Dropout(0.2)(o) From daa084c3674f0ad66abd08bc9ad8b42634a3dcde Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 17:11:15 +0100 Subject: [PATCH 82/91] training.models: re-use UNet decoder builder code --- src/eynollah/training/models.py | 297 +++++++++----------------------- 1 file changed, 85 insertions(+), 212 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 406e937..a03f028 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -198,67 +198,82 @@ def resnet50(inputs, weight_decay=1e-6, pretraining=False): return f1, f2, f3, f4, f5 +def unet_decoder(img, f1, f2, f3, f4, f5, n_classes, light=False, task="segmentation", weight_decay=1e-6): + if IMAGE_ORDERING == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + + o = Conv2D(512 if light else 1024, (1, 1), padding='same', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f5) + o = 
BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + if light: + f4 = Conv2D(512, (1, 1), padding='same', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f4) + f4 = BatchNormalization(axis=bn_axis)(f4) + f4 = Activation('relu')(f4) + + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = concatenate([o, f4], axis=MERGE_AXIS) + o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) + o = Conv2D(512, (3, 3), padding='valid', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = concatenate([o, f3], axis=MERGE_AXIS) + o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) + o = Conv2D(256, (3, 3), padding='valid', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = concatenate([o, f2], axis=MERGE_AXIS) + o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) + o = Conv2D(128, (3, 3), padding='valid', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = concatenate([o, f1], axis=MERGE_AXIS) + o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) + o = Conv2D(64, (3, 3), padding='valid', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = concatenate([o, img], axis=MERGE_AXIS) + o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) + o = Conv2D(32, (3, 3), padding='valid', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + o = Conv2D(n_classes, (1, 1), padding='same', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + if task == "segmentation": + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('softmax')(o) + else: + o = Activation('sigmoid')(o) + + return Model(img, o) + def resnet50_unet_light(n_classes, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): assert input_height % 32 == 0 assert input_width % 32 == 0 img_input = Input(shape=(input_height, input_width, 3)) - f1, f2, f3, f4, f5 = resnet50(img_input, weight_decay, pretraining) - - v512_2048 = Conv2D(512, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f5) - v512_2048 = (BatchNormalization(axis=bn_axis))(v512_2048) - v512_2048 = Activation('relu')(v512_2048) - - v512_1024 = Conv2D(512, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f4) - v512_1024 = (BatchNormalization(axis=bn_axis))(v512_1024) - v512_1024 = Activation('relu')(v512_1024) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(v512_2048) - o = (concatenate([o, v512_1024], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(512, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f3], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), 
data_format=IMAGE_ORDERING))(o) - o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f2], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(128, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f1], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, img_input], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(32, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) - if task == "segmentation": - o = (BatchNormalization(axis=bn_axis))(o) - o = (Activation('softmax'))(o) - else: - o = (Activation('sigmoid'))(o) - - model = Model(img_input, o) - return model + features = resnet50(img_input, weight_decay=weight_decay, pretraining=pretraining) + return unet_decoder(img_input, *features, n_classes, light=True, task=task, weight_decay=weight_decay) def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): assert input_height % 32 == 0 @@ -266,59 +281,9 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati img_input = Input(shape=(input_height, input_width, 3)) - f1, f2, f3, f4, f5 = resnet50(img_input, weight_decay, pretraining) - - v1024_2048 = Conv2D(1024, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))( - f5) - v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) - v1024_2048 = Activation('relu')(v1024_2048) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(v1024_2048) - o = (concatenate([o, f4], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(512, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f3], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f2], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(128, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = 
(concatenate([o, f1], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, img_input], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(32, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) - if task == "segmentation": - o = (BatchNormalization(axis=bn_axis))(o) - o = (Activation('softmax'))(o) - else: - o = (Activation('sigmoid'))(o) - - model = Model(img_input, o) - - return model + features = resnet50(img_input, weight_decay=weight_decay, pretraining=pretraining) + return unet_decoder(img_input, *features, n_classes, light=False, task=task, weight_decay=weight_decay) def vit_resnet50_unet(num_patches, n_classes, @@ -337,9 +302,9 @@ def vit_resnet50_unet(num_patches, transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - f1, f2, f3, f4, f5 = resnet50(inputs, weight_decay, pretraining) + features = resnet50(inputs, weight_decay=weight_decay, pretraining=pretraining) - patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(x) + patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(features[-1]) # Encode patches. encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) @@ -360,59 +325,16 @@ def vit_resnet50_unet(num_patches, encoded_patches = Add()([x3, x2]) encoded_patches = tf.reshape(encoded_patches, - [-1, x.shape[1], x.shape[2], + [-1, + features[-1].shape[1], + features[-1].shape[2], transformer_projection_dim // (transformer_patchsize_x * transformer_patchsize_y)]) + features[-1] = encoded_patches - v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(encoded_patches) - v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) - v1024_2048 = Activation('relu')(v1024_2048) - - o = (UpSampling2D( (2, 2), data_format=IMAGE_ORDERING))(v1024_2048) - o = (concatenate([o, f4],axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(512, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o ,f3], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f2], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(128, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f1], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), 
data_format=IMAGE_ORDERING))(o) - o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, inputs],axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(32, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(o) - if task == "segmentation": - o = (BatchNormalization(axis=bn_axis))(o) - o = (Activation('softmax'))(o) - else: - o = (Activation('sigmoid'))(o) + o = unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) - model = Model(inputs=inputs, outputs=o) - - return model + return Model(inputs, o) def vit_resnet50_unet_transformer_before_cnn(num_patches, n_classes, @@ -431,11 +353,6 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - if IMAGE_ORDERING == 'channels_last': - bn_axis = 3 - else: - bn_axis = 1 - patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) # Encode patches. encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) @@ -463,59 +380,15 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, transformer_projection_dim // (transformer_patchsize_x * transformer_patchsize_y)]) - encoded_patches = Conv2D(3, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay), name='convinput')(encoded_patches) + encoded_patches = Conv2D(3, (1, 1), padding='same', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay), + name='convinput')(encoded_patches) - f1, f2, f3, f4, f5 = resnet50(encoded_patches, weight_decay, pretraining) + features = resnet50(encoded_patches, weight_decay=weight_decay, pretraining=pretraining) - v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(f5) - v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) - v1024_2048 = Activation('relu')(v1024_2048) + o = unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) - o = (UpSampling2D( (2, 2), data_format=IMAGE_ORDERING))(v1024_2048) - o = (concatenate([o, f4],axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(512, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o ,f3], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f2], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(128, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), 
data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f1], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, inputs],axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(32, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(o) - if task == "segmentation": - o = (BatchNormalization(axis=bn_axis))(o) - o = (Activation('softmax'))(o) - else: - o = (Activation('sigmoid'))(o) - - model = Model(inputs=inputs, outputs=o) - - return model + return Model(inputs, o) def resnet50_classifier(n_classes,input_height=224,input_width=224,weight_decay=1e-6,pretraining=False): include_top=True From 9b66867c217ed17c8d8c30f45cbcc35824a2eb7a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 17:35:20 +0100 Subject: [PATCH 83/91] training.models: re-use transformer builder code --- src/eynollah/training/models.py | 109 ++++++++++++++++---------------- 1 file changed, 53 insertions(+), 56 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index a03f028..4af4949 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -285,6 +285,41 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati return unet_decoder(img_input, *features, n_classes, light=False, task=task, weight_decay=weight_decay) +def transformer_block(img, + num_patches, + patchsize_x, + patchsize_y, + mlp_head_units, + n_layers, + num_heads, + projection_dim): + patches = Patches(patchsize_x, patchsize_y)(img) + # Encode patches. + encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + + for _ in range(n_layers): + # Layer normalization 1. + x1 = LayerNormalization(epsilon=1e-6)(encoded_patches) + # Create a multi-head attention layer. + attention_output = MultiHeadAttention(num_heads=num_heads, + key_dim=projection_dim, + dropout=0.1)(x1, x1) + # Skip connection 1. + x2 = Add()([attention_output, encoded_patches]) + # Layer normalization 2. + x3 = LayerNormalization(epsilon=1e-6)(x2) + # MLP. + x3 = mlp(x3, hidden_units=mlp_head_units, dropout_rate=0.1) + # Skip connection 2. + encoded_patches = Add()([x3, x2]) + + encoded_patches = tf.reshape(encoded_patches, + [-1, + img.shape[1], + img.shape[2], + projection_dim // (patchsize_x * patchsize_y)]) + return encoded_patches + def vit_resnet50_unet(num_patches, n_classes, transformer_patchsize_x, @@ -304,33 +339,14 @@ def vit_resnet50_unet(num_patches, features = resnet50(inputs, weight_decay=weight_decay, pretraining=pretraining) - patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(features[-1]) - # Encode patches. - encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) - - for _ in range(transformer_layers): - # Layer normalization 1. - x1 = LayerNormalization(epsilon=1e-6)(encoded_patches) - # Create a multi-head attention layer. 
- attention_output = MultiHeadAttention( - num_heads=transformer_num_heads, key_dim=transformer_projection_dim, dropout=0.1 - )(x1, x1) - # Skip connection 1. - x2 = Add()([attention_output, encoded_patches]) - # Layer normalization 2. - x3 = LayerNormalization(epsilon=1e-6)(x2) - # MLP. - x3 = mlp(x3, hidden_units=transformer_mlp_head_units, dropout_rate=0.1) - # Skip connection 2. - encoded_patches = Add()([x3, x2]) - - encoded_patches = tf.reshape(encoded_patches, - [-1, - features[-1].shape[1], - features[-1].shape[2], - transformer_projection_dim // (transformer_patchsize_x * - transformer_patchsize_y)]) - features[-1] = encoded_patches + features[-1] = transformer_block(features[-1], + num_patches, + transformer_patchsize_x, + transformer_patchsize_y, + transformer_mlp_head_units, + transformer_layers, + transformer_num_heads, + transformer_projection_dim) o = unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) @@ -352,38 +368,19 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, if transformer_mlp_head_units is None: transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - - patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) - # Encode patches. - encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) - - for _ in range(transformer_layers): - # Layer normalization 1. - x1 = LayerNormalization(epsilon=1e-6)(encoded_patches) - # Create a multi-head attention layer. - attention_output = MultiHeadAttention( - num_heads=transformer_num_heads, key_dim=transformer_projection_dim, dropout=0.1 - )(x1, x1) - # Skip connection 1. - x2 = Add()([attention_output, encoded_patches]) - # Layer normalization 2. - x3 = LayerNormalization(epsilon=1e-6)(x2) - # MLP. - x3 = mlp(x3, hidden_units=transformer_mlp_head_units, dropout_rate=0.1) - # Skip connection 2. - encoded_patches = Add()([x3, x2]) - - encoded_patches = tf.reshape(encoded_patches, - [-1, - input_height, - input_width, - transformer_projection_dim // (transformer_patchsize_x * - transformer_patchsize_y)]) - + + encoded_patches = transformer_block(inputs, + num_patches, + transformer_patchsize_x, + transformer_patchsize_y, + transformer_mlp_head_units, + transformer_layers, + transformer_num_heads, + transformer_projection_dim) encoded_patches = Conv2D(3, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay), name='convinput')(encoded_patches) - + features = resnet50(encoded_patches, weight_decay=weight_decay, pretraining=pretraining) o = unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) From 7bef8fa95abc7a73ffa6648dd3ce936166818484 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 17:24:07 +0100 Subject: [PATCH 84/91] training.train: add verbose=1 consistently --- src/eynollah/training/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 05a7346..87b3551 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -471,6 +471,7 @@ def run(_config, model.fit( train_gen.prefetch(tf.data.AUTOTUNE), # .repeat()?? 
validation_data=val_gen.prefetch(tf.data.AUTOTUNE), + verbose=1, epochs=n_epochs, callbacks=callbacks, initial_epoch=index_start) @@ -544,6 +545,7 @@ def run(_config, model.fit( train_ds, #validation_data=test_ds, + verbose=1, epochs=n_epochs, callbacks=callbacks, initial_epoch=index_start) From c1b5cc92af60963a31965234bf44634dc24b7e7a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 17:43:57 +0100 Subject: [PATCH 85/91] fix typo in 7562317d --- src/eynollah/training/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 87b3551..fbbf920 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -360,7 +360,7 @@ def run(_config, if task == "enhancement": assert not is_loss_soft_dice, "for enhancement, soft_dice loss does not apply" - assert not weighted_dice, "for enhancement, weighted loss does not apply" + assert not weighted_loss, "for enhancement, weighted loss does not apply" if continue_training: custom_objects = dict() if is_loss_soft_dice: From 6a4163ae56f92c5182662da8f704e76577eb5bea Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 17:48:43 +0100 Subject: [PATCH 86/91] fix typo in 27f43c17 --- src/eynollah/training/train.py | 2 +- src/eynollah/training/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index fbbf920..f6117f7 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -523,7 +523,7 @@ def run(_config, ) train_ds = tf.data.Dataset.from_generator(gen) train_ds = train_ds.padded_batch(n_batch, - padded_shapes=([image_height, image_width, 3], [None]), + padded_shapes=([input_height, input_width, 3], [None]), padding_values=(0, padding_token), drop_remainder=True, #num_parallel_calls=tf.data.AUTOTUNE, diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index f2f4bdc..4b6033e 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -997,12 +997,12 @@ def preprocess_img(img, input_height, input_width) if padding_black: - yield from get_patches(do_padding_black(img), + yield from get_patches(do_padding_with_color(img, 'black'), do_padding_label(lab), input_height, input_width) if padding_white: - yield from get_patches(do_padding_white(img), + yield from get_patches(do_padding_with_color(img, 'white'), do_padding_label(lab), input_height, input_width) @@ -1129,7 +1129,7 @@ def preprocess_img_ocr( return scale_padd_image_for_ocr(img, input_height, input_width).astype(np.float32) / 255. 
#lab = vectorize_label(lab, char_to_num, padding_token, max_len) # now padded at Dataset.padded_batch - lab = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8")) + lab = char_to_num(tf.strings.unicode_split(lab, input_encoding="UTF-8")) yield scale_image(img), lab #to_yield = {"image": ret_x, "label": ret_y} From 67fca82f384074028445f47fbcfdae44534668e1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 18:06:08 +0100 Subject: [PATCH 87/91] fix missing import in 27f43c17 --- src/eynollah/training/models.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 4af4949..ba61764 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -441,7 +441,7 @@ def cnn_rnn_ocr_model(image_height=None, image_width=None, n_classes=None, max_s x = Conv2D(64,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn2")(x) x = Activation("relu", name="relu2")(x) - x = MaxPool2D(pool_size=(1,2),strides=(1,2))(x) + x = MaxPooling2D(pool_size=(1,2),strides=(1,2))(x) x = Conv2D(128,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn3")(x) @@ -449,7 +449,7 @@ def cnn_rnn_ocr_model(image_height=None, image_width=None, n_classes=None, max_s x = Conv2D(128,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn4")(x) x = Activation("relu", name="relu4")(x) - x = MaxPool2D(pool_size=(1,2),strides=(1,2))(x) + x = MaxPooling2D(pool_size=(1,2),strides=(1,2))(x) x = Conv2D(256,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn5")(x) @@ -457,7 +457,7 @@ def cnn_rnn_ocr_model(image_height=None, image_width=None, n_classes=None, max_s x = Conv2D(256,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn6")(x) x = Activation("relu", name="relu6")(x) - x = MaxPool2D(pool_size=(2,2),strides=(2,2))(x) + x = MaxPooling2D(pool_size=(2,2),strides=(2,2))(x) x = Conv2D(image_width,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn7")(x) @@ -465,8 +465,8 @@ def cnn_rnn_ocr_model(image_height=None, image_width=None, n_classes=None, max_s x = Conv2D(image_width,kernel_size=(16,1))(x) x = BatchNormalization(name="bn8")(x) x = Activation("relu", name="relu8")(x) - x2d = MaxPool2D(pool_size=(1,2),strides=(1,2))(x) - x4d = MaxPool2D(pool_size=(1,2),strides=(1,2))(x2d) + x2d = MaxPooling2D(pool_size=(1,2),strides=(1,2))(x) + x4d = MaxPooling2D(pool_size=(1,2),strides=(1,2))(x2d) new_shape = (x.shape[1]*x.shape[2], x.shape[3]) From 5f713336495a6d392027637a241247d4dd355c79 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 18:11:49 +0100 Subject: [PATCH 88/91] fix missing import in 49261fa9 --- src/eynollah/training/inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eynollah/training/inference.py b/src/eynollah/training/inference.py index c38b79f..2be937d 100644 --- a/src/eynollah/training/inference.py +++ b/src/eynollah/training/inference.py @@ -17,6 +17,7 @@ import xml.etree.ElementTree as ET os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras.models import Model, load_model +from tensorflow.keras.layers import StringLookup from .gt_gen_utils import ( filter_contours_area_of_image, From f61effe8ce56e4dd4ebb2d9380b51946dfbac96a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 18:20:58 +0100 Subject: [PATCH 89/91] fix typo in c8240905 --- src/eynollah/training/gt_gen_utils.py 
| 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py index 8204a8e..d5ad4d9 100644 --- a/src/eynollah/training/gt_gen_utils.py +++ b/src/eynollah/training/gt_gen_utils.py @@ -238,12 +238,11 @@ def update_region_contours(co_text, img_boundary, erosion_rate, dilation_rate, y con_eroded = return_contours_of_interested_region(img_boundary_in,pixel, min_size ) try: - if len(con_eroded)>1: - cnt_size = np.array([cv2.contourArea(con_eroded[j]) for j in range(len(con_eroded))]) - cnt = contours[np.argmax(cnt_size)] - co_text_eroded.append(cnt) + if len(con_eroded) > 1: + largest = np.argmax(list(map(cv2.contourArea, con_eroded))) else: - co_text_eroded.append(con_eroded[0]) + largest = 0 + co_text_eroded.append(con_eroded[largest]) except: co_text_eroded.append(con) From 003c88f18ab513c3622bbc12f3a2bd44e75bd8f3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 18:23:32 +0100 Subject: [PATCH 90/91] fix double import in 82266f82 --- src/eynollah/cli/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/eynollah/cli/__init__.py b/src/eynollah/cli/__init__.py index 05dafa1..43ed046 100644 --- a/src/eynollah/cli/__init__.py +++ b/src/eynollah/cli/__init__.py @@ -2,14 +2,12 @@ # this must be the first import of the CLI! from ..eynollah_imports import imported_libs -from .cli_models import models_cli -from .cli_binarize import binarize_cli - from .cli import main from .cli_binarize import binarize_cli from .cli_enhance import enhance_cli from .cli_extract_images import extract_images_cli from .cli_layout import layout_cli +from .cli_models import models_cli from .cli_ocr import ocr_cli from .cli_readingorder import readingorder_cli From a9496bbc7079d11706e34d1fcef4a0269fe23117 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 18:39:30 +0100 Subject: [PATCH 91/91] enhancer/mbreorder: use std Keras data loader for classification --- src/eynollah/image_enhancer.py | 6 ++++-- src/eynollah/mb_ro_on_layout.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/eynollah/image_enhancer.py b/src/eynollah/image_enhancer.py index babbd55..67145a3 100644 --- a/src/eynollah/image_enhancer.py +++ b/src/eynollah/image_enhancer.py @@ -15,11 +15,13 @@ from pathlib import Path import gc import cv2 -from keras.models import Model import numpy as np -import tensorflow as tf # type: ignore from skimage.morphology import skeletonize +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 +import tensorflow as tf # type: ignore +from tensorflow.keras.models import Model + from .model_zoo import EynollahModelZoo from .utils.resize import resize_image from .utils.pil_cv2 import pil2cv diff --git a/src/eynollah/mb_ro_on_layout.py b/src/eynollah/mb_ro_on_layout.py index eec544c..22fe97b 100644 --- a/src/eynollah/mb_ro_on_layout.py +++ b/src/eynollah/mb_ro_on_layout.py @@ -14,10 +14,12 @@ from pathlib import Path import xml.etree.ElementTree as ET import cv2 -from keras.models import Model import numpy as np import statistics + +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf +from tensorflow.keras.models import Model from .model_zoo import EynollahModelZoo from .utils.resize import resize_image
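
Patch 91 above completes the series' Keras-2 pinning: the `os.environ['TF_USE_LEGACY_KERAS'] = '1'` line only has an effect if it runs before TensorFlow is imported for the first time (the Keras-2-vs-3 choice is made at import time), and the removed `from keras.models import Model` would have bypassed the switch entirely, since the standalone `keras` package can be Keras 3 regardless of the variable. A short sketch of the required ordering (illustrative only; what gets printed depends on the installed TF version):

    import os
    os.environ['TF_USE_LEGACY_KERAS'] = '1'    # set before the first TF import

    import tensorflow as tf                    # Keras selection happens here
    from tensorflow.keras.models import Model  # resolves to legacy Keras 2

    # On TF >= 2.16 this should report a 2.x (tf_keras) version rather than
    # 3.x, matching the "avoid Keras 3 after TF 2.15" comment in the patches.
    print(tf.keras.__version__)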