From 086c1880ac600e8d4b043fc8206298e9e964081d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 15 Oct 2025 12:24:21 +0200 Subject: [PATCH 001/118] binarization: add option `--overwrite`, skip existing outputs (also, simplify `run` and separate `run_single`) --- src/eynollah/cli.py | 16 ++++-- src/eynollah/sbb_binarize.py | 96 +++++++++++++++--------------------- 2 files changed, 52 insertions(+), 60 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index c9bad52..e4a24e4 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -79,18 +79,28 @@ def machine_based_reading_order(input, dir_in, out, model, log_level): type=click.Path(file_okay=True, dir_okay=True), required=True, ) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) @click.option( "--log_level", "-l", type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), help="Override log level globally to this", ) -def binarization(patches, model_dir, input_image, dir_in, output, log_level): +def binarization(patches, model_dir, input_image, dir_in, output, overwrite, log_level): assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." 
binarizer = SbbBinarizer(model_dir) if log_level: - binarizer.log.setLevel(getLevelName(log_level)) - binarizer.run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in) + binarizer.logger.setLevel(getLevelName(log_level)) + binarizer.run(overwrite=overwrite, + use_patches=patches, + image_path=input_image, + output=output, + dir_in=dir_in) @main.command() diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index 3716987..0eab2ae 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -25,7 +25,7 @@ class SbbBinarizer: def __init__(self, model_dir, logger=None): self.model_dir = model_dir - self.log = logger if logger else logging.getLogger('SbbBinarizer') + self.logger = logger if logger else logging.getLogger('SbbBinarizer') self.start_new_session() @@ -315,64 +315,46 @@ class SbbBinarizer: prediction_true = prediction_true.astype(np.uint8) return prediction_true[:,:,0] - def run(self, image=None, image_path=None, output=None, use_patches=False, dir_in=None): - # print(dir_in,'dir_in') - if not dir_in: - if (image is not None and image_path is not None) or \ - (image is None and image_path is None): - raise ValueError("Must pass either a opencv2 image or an image_path") - if image_path is not None: - image = cv2.imread(image_path) - img_last = 0 - for n, (model, model_file) in enumerate(zip(self.models, self.model_files)): - self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files))) - - res = self.predict(model, image, use_patches) - - img_fin = np.zeros((res.shape[0], res.shape[1], 3)) - res[:, :][res[:, :] == 0] = 2 - res = res - 1 - res = res * 255 - img_fin[:, :, 0] = res - img_fin[:, :, 1] = res - img_fin[:, :, 2] = res - - img_fin = img_fin.astype(np.uint8) - img_fin = (res[:, :] == 0) * 255 - img_last = img_last + img_fin - - kernel = np.ones((5, 5), np.uint8) - img_last[:, :][img_last[:, :] > 0] = 255 - img_last = (img_last[:, :] == 0) * 255 - if output: 
-                cv2.imwrite(output, img_last)
-            return img_last
+    def run(self, image_path=None, output=None, dir_in=None, use_patches=False, overwrite=False):
+        if dir_in:
+            ls_imgs = [(os.path.join(dir_in, image_filename),
+                        os.path.join(output, os.path.splitext(image_filename)[0] + '.png'))
+                       for image_filename in filter(is_image_filename,
+                                                    os.listdir(dir_in))]
         else:
-            ls_imgs = list(filter(is_image_filename, os.listdir(dir_in)))
-            for image_name in ls_imgs:
-                image_stem = image_name.split('.')[0]
-                print(image_name,'image_name')
-                image = cv2.imread(os.path.join(dir_in,image_name) )
-                img_last = 0
-                for n, (model, model_file) in enumerate(zip(self.models, self.model_files)):
-                    self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files)))
+            ls_imgs = [(image_path, output)]
 
-                    res = self.predict(model, image, use_patches)
+        for input_path, output_path in ls_imgs:
+            if os.path.exists(output_path):
+                if overwrite:
+                    self.logger.warning("will overwrite existing output file '%s'", output_path)
+                else:
+                    self.logger.warning("will skip input for existing output file '%s'", output_path)
+                    continue
+            image = cv2.imread(input_path)
+            result = self.run_single(image, use_patches)
+            cv2.imwrite(output_path, result)
 
-                    img_fin = np.zeros((res.shape[0], res.shape[1], 3))
-                    res[:, :][res[:, :] == 0] = 2
-                    res = res - 1
-                    res = res * 255
-                    img_fin[:, :, 0] = res
-                    img_fin[:, :, 1] = res
-                    img_fin[:, :, 2] = res
+    def run_single(self, image: np.ndarray, use_patches=False):
+        img_last = 0
+        for n, (model, model_file) in enumerate(zip(self.models, self.model_files)):
+            self.logger.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files)))
 
-                    img_fin = img_fin.astype(np.uint8)
-                    img_fin = (res[:, :] == 0) * 255
-                    img_last = img_last + img_fin
+            res = self.predict(model, image, use_patches)
 
-                    kernel = np.ones((5, 5), np.uint8)
-                    img_last[:, :][img_last[:, :] > 0] = 255
-                    img_last = (img_last[:, :] == 0) * 255
-
-                    
cv2.imwrite(os.path.join(output, image_stem + '.png'), img_last) + img_fin = np.zeros((res.shape[0], res.shape[1], 3)) + res[:, :][res[:, :] == 0] = 2 + res = res - 1 + res = res * 255 + img_fin[:, :, 0] = res + img_fin[:, :, 1] = res + img_fin[:, :, 2] = res + + img_fin = img_fin.astype(np.uint8) + img_fin = (res[:, :] == 0) * 255 + img_last = img_last + img_fin + + kernel = np.ones((5, 5), np.uint8) + img_last[:, :][img_last[:, :] > 0] = 255 + img_last = (img_last[:, :] == 0) * 255 + return img_last From 184927fb5488f440948320ca97d716144da5012c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:16:57 +0200 Subject: [PATCH 002/118] `find_num_cols`: re-sort peaks when cutting n-best `num_col_classifier` --- src/eynollah/utils/__init__.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 5ccb2af..7c47407 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -463,22 +463,19 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl interest_neg_fin = interest_neg[(interest_neg < grenze)] peaks_neg_fin = peaks_neg[(interest_neg < grenze)] - # interest_neg_fin=interest_neg[(interest_neg= 3: - index_sort_interest_neg_fin= np.argsort(interest_neg_fin) - peaks_neg_sorted = np.array(peaks_neg)[index_sort_interest_neg_fin] - interest_neg_fin_sorted = np.array(interest_neg_fin)[index_sort_interest_neg_fin] + # found too few columns here: ignore 'grenze' and take the deepest N peaks + sort_by_height = np.argsort(interest_neg)[:num_col_classifier] + peaks_neg_fin = peaks_neg[sort_by_height] + interest_neg_fin = interest_neg[sort_by_height] + # print(peaks_neg_fin, "peaks_neg[sorted_by_height]") + sort_by_pos = np.argsort(peaks_neg_fin) + peaks_neg_fin = peaks_neg_fin[sort_by_pos] + interest_neg_fin = interest_neg_fin[sort_by_pos] - if len(index_sort_interest_neg_fin)>=num_col_classifier: - 
peaks_neg_fin = list( peaks_neg_sorted[:num_col_classifier] ) - interest_neg_fin = list( interest_neg_fin_sorted[:num_col_classifier] ) - else: - peaks_neg_fin = peaks_neg[:] - interest_neg_fin = interest_neg[:] - - num_col = (len(interest_neg_fin)) + 1 + num_col = len(interest_neg_fin) + 1 # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') From 48761c3e127bfde488cc3ff6dd7edc97eb85bfd0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:20:12 +0200 Subject: [PATCH 003/118] `find_num_col`: simplify, add better plotting (but commented out) --- src/eynollah/utils/__init__.py | 208 +++++++++++++++++---------------- 1 file changed, 108 insertions(+), 100 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 7c47407..ce72df4 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -396,16 +396,18 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8): def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8): if not regions_without_separators.any(): return 0, [] - #plt.imshow(regions_without_separators) - #plt.show() regions_without_separators_0 = regions_without_separators.sum(axis=0) - ##plt.plot(regions_without_separators_0) - ##plt.show() + # fig, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(regions_without_separators_0) + # plt.show() sigma_ = 35 # 70#35 - meda_n_updown = regions_without_separators_0[len(regions_without_separators_0) :: -1] + meda_n_updown = regions_without_separators_0[::-1] first_nonzero = next((i for i, x in enumerate(regions_without_separators_0) if x), 0) last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) last_nonzero = len(regions_without_separators_0) - last_nonzero + last_nonzero = last_nonzero - 100 + first_nonzero = first_nonzero + 200 y = regions_without_separators_0 # [first_nonzero:last_nonzero] 
y_help = np.zeros(len(y) + 20) y_help[10 : len(y) + 10] = y @@ -416,28 +418,44 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl z = gaussian_filter1d(y, sigma_) zneg = gaussian_filter1d(zneg, sigma_) - peaks_neg, _ = find_peaks(zneg, height=0) - #plt.plot(zneg) - #plt.plot(peaks_neg, zneg[peaks_neg], 'rx') - #plt.show() peaks, _ = find_peaks(z, height=0) + peaks_neg, _ = find_peaks(zneg, height=0) + # _, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.set_title("z") + # ax1.plot(z) + # ax1.scatter(peaks, z[peaks]) + # ax1.axvline(0.06 * len(y), label="first") + # ax1.axvline(0.94 * len(y), label="last") + # ax1.text(0.06 * len(y), 0, "first", rotation=90) + # ax1.text(0.94 * len(y), 0, "last", rotation=90) + # ax1.axhline(10, label="minimum") + # ax1.text(0, 10, "minimum") + # ax2.set_title("zneg") + # ax2.plot(zneg) + # ax2.scatter(peaks_neg, zneg[peaks_neg]) + # ax2.axvline(first_nonzero, label="first nonzero") + # ax2.axvline(last_nonzero, label="last nonzero") + # ax2.text(first_nonzero, 0, "first nonzero", rotation=90) + # ax2.text(last_nonzero, 0, "last nonzero", rotation=90) + # ax2.axvline(370, label="first") + # ax2.axvline(len(y) - 370, label="last") + # ax2.text(370, 0, "first", rotation=90) + # ax2.text(len(y) - 370, 0, "last", rotation=90) + # plt.show() peaks_neg = peaks_neg - 10 - 10 - last_nonzero = last_nonzero - 100 - first_nonzero = first_nonzero + 200 - - peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & - (peaks_neg < last_nonzero)] - peaks = peaks[(peaks > 0.06 * regions_without_separators.shape[1]) & - (peaks < 0.94 * regions_without_separators.shape[1])] - peaks_neg = peaks_neg[(peaks_neg > 370) & - (peaks_neg < (regions_without_separators.shape[1] - 370))] + peaks = peaks[(peaks > 0.06 * len(y)) & + (peaks < 0.94 * len(y))] interest_pos = z[peaks] interest_pos = interest_pos[interest_pos > 10] if not interest_pos.any(): return 0, [] # plt.plot(z) # plt.show() + peaks_neg = peaks_neg[(peaks_neg > 
first_nonzero) & + (peaks_neg < last_nonzero)] + peaks_neg = peaks_neg[(peaks_neg > 370) & + (peaks_neg < len(y) - 370)] interest_neg = z[peaks_neg] if not interest_neg.any(): return 0, [] @@ -445,21 +463,28 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl min_peaks_pos = np.min(interest_pos) max_peaks_pos = np.max(interest_pos) - if max_peaks_pos / min_peaks_pos >= 35: + #print(min_peaks_pos, max_peaks_pos, max_peaks_pos / min_peaks_pos, 'minmax') + if max_peaks_pos / (min_peaks_pos or 1e-9) >= 35: min_peaks_pos = np.mean(interest_pos) min_peaks_neg = 0 # np.min(interest_neg) - # print(np.min(interest_pos),np.max(interest_pos),np.max(interest_pos)/np.min(interest_pos),'minmax') dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier grenze = min_peaks_pos - dis_talaei - # np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 + #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0 # print(interest_neg,'interest_neg') # print(grenze,'grenze') # print(min_peaks_pos,'min_peaks_pos') # print(dis_talaei,'dis_talaei') # print(peaks_neg,'peaks_neg') + # fig, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(z) + # ax2.scatter(peaks_neg, z[peaks_neg]) + # ax2.axhline(grenze, label="grenze") + # ax2.text(0, grenze, "grenze") + # plt.show() interest_neg_fin = interest_neg[(interest_neg < grenze)] peaks_neg_fin = peaks_neg[(interest_neg < grenze)] @@ -479,46 +504,38 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_g_l = int(len(y) / 4.0) - p_g_u = len(y) - int(len(y) / 4.0) - - if num_col == 3: - if ((peaks_neg_fin[0] > p_g_u and - peaks_neg_fin[1] > p_g_u) or - (peaks_neg_fin[0] < p_g_l and - peaks_neg_fin[1] < p_g_l) or - (peaks_neg_fin[0] + 
200 < p_m and - peaks_neg_fin[1] < p_m) or - (peaks_neg_fin[0] - 200 > p_m and - peaks_neg_fin[1] > p_m)): - num_col = 1 - peaks_neg_fin = [] - - if num_col == 2: - if (peaks_neg_fin[0] > p_g_u or - peaks_neg_fin[0] < p_g_l): - num_col = 1 - peaks_neg_fin = [] + # cancel if resulting split is highly unbalanced across available width + if ((num_col == 3 and + ((peaks_neg_fin[0] > 0.75 * len(y) and + peaks_neg_fin[1] > 0.75 * len(y)) or + (peaks_neg_fin[0] < 0.25 * len(y) and + peaks_neg_fin[1] < 0.25 * len(y)) or + (peaks_neg_fin[0] < 0.5 * len(y) - 200 and + peaks_neg_fin[1] < 0.5 * len(y)) or + (peaks_neg_fin[0] > 0.5 * len(y) + 200 and + peaks_neg_fin[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_fin[0] > 0.75 * len(y) or + peaks_neg_fin[0] < 0.25 * len(y)))): + num_col = 1 + peaks_neg_fin = [] ##print(len(peaks_neg_fin)) + # filter out peaks that are too close (<400px) to each other: + # among each group, pick the position with smallest amount of text diff_peaks = np.abs(np.diff(peaks_neg_fin)) cut_off = 400 peaks_neg_true = [] forest = [] - # print(len(peaks_neg_fin),'len_') - for i in range(len(peaks_neg_fin)): if i == 0: forest.append(peaks_neg_fin[i]) if i < len(peaks_neg_fin) - 1: if diff_peaks[i] <= cut_off: forest.append(peaks_neg_fin[i + 1]) - if diff_peaks[i] > cut_off: + else: # print(forest[np.argmin(z[forest]) ] ) if not isNaN(forest[np.argmin(z[forest])]): peaks_neg_true.append(forest[np.argmin(z[forest])]) @@ -530,68 +547,59 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl peaks_neg_true.append(forest[np.argmin(z[forest])]) num_col = len(peaks_neg_true) + 1 - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_quarter = int(len(y) / 5.0) - p_g_l = int(len(y) / 4.0) - p_g_u = len(y) - int(len(y) / 4.0) - - p_u_quarter = len(y) - p_quarter - + #print(peaks_neg_true, "peaks_neg_true") ##print(num_col,'early') - if num_col == 3: - if ((peaks_neg_true[0] > p_g_u and - peaks_neg_true[1] > p_g_u) or - 
(peaks_neg_true[0] < p_g_l and - peaks_neg_true[1] < p_g_l) or - (peaks_neg_true[0] < p_m and - peaks_neg_true[1] + 200 < p_m) or - (peaks_neg_true[0] - 200 > p_m and - peaks_neg_true[1] > p_m)): - num_col = 1 - peaks_neg_true = [] - elif (peaks_neg_true[0] < p_g_u and - peaks_neg_true[0] > p_g_l and - peaks_neg_true[1] > p_u_quarter): - peaks_neg_true = [peaks_neg_true[0]] - elif (peaks_neg_true[1] < p_g_u and - peaks_neg_true[1] > p_g_l and - peaks_neg_true[0] < p_quarter): - peaks_neg_true = [peaks_neg_true[1]] + # cancel if resulting split is highly unbalanced across available width + if ((num_col == 3 and + ((peaks_neg_true[0] > 0.75 * len(y) and + peaks_neg_true[1] > 0.75 * len(y)) or + (peaks_neg_true[0] < 0.25 * len(y) and + peaks_neg_true[1] < 0.25 * len(y)) or + (peaks_neg_true[0] < 0.5 * len(y) - 200 and + peaks_neg_true[1] < 0.5 * len(y)) or + (peaks_neg_true[0] > 0.5 * len(y) + 200 and + peaks_neg_true[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_true[0] > 0.75 * len(y) or + peaks_neg_true[0] < 0.25 * len(y)))): + num_col = 1 + peaks_neg_true = [] + if (num_col == 3 and + (peaks_neg_true[0] < 0.75 * len(y) and + peaks_neg_true[0] > 0.25 * len(y) and + peaks_neg_true[1] > 0.80 * len(y))): + num_col = 2 + peaks_neg_true = [peaks_neg_true[0]] + if (num_col == 3 and + (peaks_neg_true[1] < 0.75 * len(y) and + peaks_neg_true[1] > 0.25 * len(y) and + peaks_neg_true[0] < 0.20 * len(y))): + num_col = 2 + peaks_neg_true = [peaks_neg_true[1]] - if num_col == 2: - if (peaks_neg_true[0] > p_g_u or - peaks_neg_true[0] < p_g_l): - num_col = 1 - peaks_neg_true = [] + # get rid of too narrow columns (not used) + # if np.count_nonzero(diff_peaks < 360): + # arg_help = np.arange(len(diff_peaks)) + # arg_help_ann = arg_help[diff_peaks < 360] + # peaks_neg_fin_new = [] + # for ii in range(len(peaks_neg_fin)): + # if ii in arg_help_ann: + # if interest_neg_fin[ii] < interest_neg_fin[ii + 1]: + # peaks_neg_fin_new.append(peaks_neg_fin[ii]) + # else: + # 
peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - diff_peaks_abnormal = diff_peaks[diff_peaks < 360] - - if len(diff_peaks_abnormal) > 0: - arg_help = np.arange(len(diff_peaks)) - arg_help_ann = arg_help[diff_peaks < 360] - - peaks_neg_fin_new = [] - - for ii in range(len(peaks_neg_fin)): - if ii in arg_help_ann: - arg_min = np.argmin([interest_neg_fin[ii], interest_neg_fin[ii + 1]]) - if arg_min == 0: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - - elif (ii - 1) not in arg_help_ann: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new = peaks_neg_fin + # elif (ii - 1) not in arg_help_ann: + # peaks_neg_fin_new.append(peaks_neg_fin[ii]) + # else: + # peaks_neg_fin_new = peaks_neg_fin # plt.plot(gaussian_filter1d(y, sigma_)) # plt.plot(peaks_neg_true,z[peaks_neg_true],'*') # plt.plot([0,len(y)], [grenze,grenze]) # plt.show() ##print(len(peaks_neg_true)) + #print(peaks_neg_true, "peaks_neg_true") return len(peaks_neg_true), peaks_neg_true def find_num_col_only_image(regions_without_separators, multiplier=3.8): From c43a825d1d26c36beee3bbc2e038f8c0cda4221b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:26:01 +0200 Subject: [PATCH 004/118] `order_of_regions`: filter out-of-image peaks --- src/eynollah/utils/__init__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index ce72df4..677ed53 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1216,15 +1216,16 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): peaks_neg, _ = find_peaks(zneg, height=0) peaks_neg = peaks_neg - 20 - 20 - ##plt.plot(z) - ##plt.show() - cx_main, cy_main = find_center_of_contours(contours_main) - cx_head, cy_head = find_center_of_contours(contours_head) - - peaks_neg_new = np.append(np.insert(peaks_neg, 0, 0), textline_mask.shape[0]) + 
peaks_neg_new = np.array([0] + + # peaks can be beyond box due to padding and smoothing + [peak for peak in peaks_neg + if 0 < peak and peak < textline_mask.shape[0]] + + [textline_mask.shape[0]]) # offset from bbox of mask peaks_neg_new += y_ref + cx_main, cy_main = find_center_of_contours(contours_main) + cx_head, cy_head = find_center_of_contours(contours_head) # assert not len(cy_main) or np.min(peaks_neg_new) <= np.min(cy_main) and np.max(cy_main) <= np.max(peaks_neg_new) # assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new) From d3d599b0108bf17802bda2f9808620e3cd8471db Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:27:23 +0200 Subject: [PATCH 005/118] `order_of_regions`: add better plotting (but commented out) --- src/eynollah/eynollah.py | 2 +- src/eynollah/utils/__init__.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 13acba6..9412861 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2553,7 +2553,7 @@ class Eynollah: con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 677ed53..f2e3581 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1197,7 +1197,7 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed -def order_of_regions(textline_mask, contours_main, contours_head, y_ref): +def 
order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): ##plt.imshow(textline_mask) ##plt.show() y = textline_mask.sum(axis=1) # horizontal projection profile @@ -1208,6 +1208,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): #z = gaussian_filter1d(y_padded, sigma_gaus) #peaks, _ = find_peaks(z, height=0) #peaks = peaks - 20 + ##plt.plot(z) + ##plt.show() zneg_rev = np.max(y_padded) - y_padded zneg = np.zeros(len(zneg_rev) + 40) zneg[20 : len(zneg_rev) + 20] = zneg_rev @@ -1250,6 +1252,22 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): indexes_in, types_in, cxs_in, cys_in, typed_indexes_in = \ matrix_of_orders[(matrix_of_orders[:, 3] >= top) & (matrix_of_orders[:, 3] < bot)].T + # if indexes_in.size: + # img = textline_mask.copy() + # plt.imshow(img) + # plt.gca().add_patch(patches.Rectangle((0, top-y_ref), img.shape[1], bot-top, alpha=0.5, color='gray')) + # xrange = np.arange(0, img.shape[1], 50) + # yrange = np.arange(0, img.shape[0], 50) + # plt.gca().set_xticks(xrange, xrange + x_ref) + # plt.gca().set_yticks(yrange, yrange + y_ref) + # for idx, type_, cx, cy in zip(typed_indexes_in, types_in, cxs_in, cys_in): + # cnt = (contours_main if type_ == 1 else contours_head)[idx] + # col = 'red' if type_ == 1 else 'blue' + # plt.scatter(cx - x_ref, cy - y_ref, 20, c=col, marker='o') + # plt.gca().add_patch(patches.Polygon(cnt[:, 0] - [[x_ref, y_ref]], closed=False, fill=False, color=col)) + # plt.title("box contours centered in %d:%d (red=main / blue=heading)" % (top, bot)) + # plt.show() + sorted_inside = np.argsort(cxs_in) final_indexers_sorted.extend(indexes_in[sorted_inside]) final_types.extend(types_in[sorted_inside]) From 542d38ab432e3089ebc8fefd3caee2915fe6b031 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:34:56 +0200 Subject: [PATCH 006/118] =?UTF-8?q?`find=5Fnumber=5Fof=5Fcolumns=5Fin=5Fdo?= 
=?UTF-8?q?cument`:=20simplify,=20rename=20`line`=E2=86=92`seps`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/utils/__init__.py | 244 +++++++++++++++------------------ 1 file changed, 109 insertions(+), 135 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f2e3581..168899f 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1377,175 +1377,149 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot.append(last_point) return peaks_neg_tot -def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_lines, contours_h=None): - t_ins_c0 = time.time() - separators_closeup=( (region_pre_p[:,:]==label_lines))*1 - separators_closeup[0:110,:]=0 - separators_closeup[separators_closeup.shape[0]-150:,:]=0 +def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): + separators_closeup = 1 * (region_pre_p == label_seps) + separators_closeup[0:110] = 0 + separators_closeup[-150:] = 0 kernel = np.ones((5,5),np.uint8) - separators_closeup=separators_closeup.astype(np.uint8) - separators_closeup = cv2.dilate(separators_closeup,kernel,iterations = 1) - separators_closeup = cv2.erode(separators_closeup,kernel,iterations = 1) + separators_closeup = separators_closeup.astype(np.uint8) + separators_closeup = cv2.morphologyEx(separators_closeup, cv2.MORPH_CLOSE, kernel, iterations=1) - separators_closeup_new=np.zeros((separators_closeup.shape[0] ,separators_closeup.shape[1] )) - separators_closeup_n=np.copy(separators_closeup) - separators_closeup_n=separators_closeup_n.astype(np.uint8) + separators_closeup_n = separators_closeup.astype(np.uint8) # to be returned - separators_closeup_n_binary=np.zeros(( separators_closeup_n.shape[0],separators_closeup_n.shape[1]) ) - separators_closeup_n_binary[:,:]=separators_closeup_n[:,:] - 
separators_closeup_n_binary[:,:][separators_closeup_n_binary[:,:]!=0]=1 + separators_closeup_n_binary = separators_closeup_n.copy() - _, thresh_e = cv2.threshold(separators_closeup_n_binary, 0, 255, 0) - contours_line_e, _ = cv2.findContours(thresh_e.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - _, dist_xe, _, _, _, _, y_min_main, y_max_main, _ = \ - find_features_of_lines(contours_line_e) - dist_ye = y_max_main - y_min_main - args_e=np.arange(len(contours_line_e)) - args_hor_e=args_e[(dist_ye<=50) & - (dist_xe>=3*dist_ye)] - cnts_hor_e=[] - for ce in args_hor_e: - cnts_hor_e.append(contours_line_e[ce]) + # find horizontal lines by contour properties + contours_sep_e, _ = cv2.findContours(separators_closeup_n_binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cnts_hor_e = [] + for cnt in contours_sep_e: + max_xe = cnt[:, 0, 0].max() + min_xe = cnt[:, 0, 0].min() + max_ye = cnt[:, 0, 1].max() + min_ye = cnt[:, 0, 1].min() + dist_xe = max_xe - min_xe + dist_ye = max_ye - min_ye + if dist_ye <= 50 and dist_xe >= 3 * dist_ye: + cnts_hor_e.append(cnt) - separators_closeup_n_binary=cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) - gray = cv2.bitwise_not(separators_closeup_n_binary) - gray=gray.astype(np.uint8) + # delete horizontal contours (leaving only the edges) + separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) + edges = cv2.adaptiveThreshold(separators_closeup_n_binary * 255, 255, + cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2) + horizontal = np.copy(edges) + vertical = np.copy(edges) - bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, \ - cv2.THRESH_BINARY, 15, -2) - horizontal = np.copy(bw) - vertical = np.copy(bw) - - cols = horizontal.shape[1] - horizontal_size = cols // 30 - # Create structure element for extracting horizontal lines through morphology operations + horizontal_size = horizontal.shape[1] // 30 + # find horizontal lines by morphology 
horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1)) - # Apply morphology operations - horizontal = cv2.erode(horizontal, horizontalStructure) - horizontal = cv2.dilate(horizontal, horizontalStructure) - - kernel = np.ones((5,5),np.uint8) - horizontal = cv2.dilate(horizontal,kernel,iterations = 2) - horizontal = cv2.erode(horizontal,kernel,iterations = 2) + horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_OPEN, horizontalStructure) + horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_CLOSE, kernel, iterations=2) + # re-insert deleted horizontal contours horizontal = cv2.fillPoly(horizontal, pts=cnts_hor_e, color=255) - rows = vertical.shape[0] - verticalsize = rows // 30 - # Create structure element for extracting vertical lines through morphology operations - verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) - # Apply morphology operations - vertical = cv2.erode(vertical, verticalStructure) - vertical = cv2.dilate(vertical, verticalStructure) - vertical = cv2.dilate(vertical,kernel,iterations = 1) + vertical_size = vertical.shape[0] // 30 + # find vertical lines by morphology + verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size)) + vertical = cv2.morphologyEx(vertical, cv2.MORPH_OPEN, verticalStructure) + vertical = cv2.dilate(vertical, kernel, iterations=1) horizontal, special_separators = \ combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( vertical, horizontal, num_col_classifier) - separators_closeup_new[:,:][vertical[:,:]!=0]=1 - separators_closeup_new[:,:][horizontal[:,:]!=0]=1 - _, thresh = cv2.threshold(vertical, 0, 255, 0) - contours_line_vers, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ - find_features_of_lines(contours_line_vers) + contours_sep_vers, _ = 
cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ + find_features_of_lines(contours_sep_vers) - args=np.arange(len(slope_lines)) - args_ver=args[slope_lines==1] - dist_x_ver=dist_x[slope_lines==1] - y_min_main_ver=y_min_main[slope_lines==1] - y_max_main_ver=y_max_main[slope_lines==1] - x_min_main_ver=x_min_main[slope_lines==1] - x_max_main_ver=x_max_main[slope_lines==1] - cx_main_ver=cx_main[slope_lines==1] - dist_y_ver=y_max_main_ver-y_min_main_ver + args=np.arange(len(slope_seps)) + args_ver=args[slope_seps==1] + dist_x_ver=dist_x[slope_seps==1] + y_min_seps_ver=y_min_seps[slope_seps==1] + y_max_seps_ver=y_max_seps[slope_seps==1] + x_min_seps_ver=x_min_seps[slope_seps==1] + x_max_seps_ver=x_max_seps[slope_seps==1] + cx_seps_ver=cx_seps[slope_seps==1] + dist_y_ver=y_max_seps_ver-y_min_seps_ver len_y=separators_closeup.shape[0]/3.0 _, thresh = cv2.threshold(horizontal, 0, 255, 0) - contours_line_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ - find_features_of_lines(contours_line_hors) + contours_sep_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ + find_features_of_lines(contours_sep_hors) - slope_lines_org_hor=slope_lines_org[slope_lines==0] - args=np.arange(len(slope_lines)) + slope_seps_org_hor=slope_seps_org[slope_seps==0] + args=np.arange(len(slope_seps)) len_x=separators_closeup.shape[1]/5.0 - dist_y=np.abs(y_max_main-y_min_main) + dist_y=np.abs(y_max_seps-y_min_seps) - args_hor=args[slope_lines==0] - dist_x_hor=dist_x[slope_lines==0] - y_min_main_hor=y_min_main[slope_lines==0] - y_max_main_hor=y_max_main[slope_lines==0] - 
x_min_main_hor=x_min_main[slope_lines==0] - x_max_main_hor=x_max_main[slope_lines==0] - dist_y_hor=dist_y[slope_lines==0] - cy_main_hor=cy_main[slope_lines==0] + args_hor=args[slope_seps==0] + dist_x_hor=dist_x[slope_seps==0] + y_min_seps_hor=y_min_seps[slope_seps==0] + y_max_seps_hor=y_max_seps[slope_seps==0] + x_min_seps_hor=x_min_seps[slope_seps==0] + x_max_seps_hor=x_max_seps[slope_seps==0] + dist_y_hor=dist_y[slope_seps==0] + cy_seps_hor=cy_seps[slope_seps==0] args_hor=args_hor[dist_x_hor>=len_x/2.0] - x_max_main_hor=x_max_main_hor[dist_x_hor>=len_x/2.0] - x_min_main_hor=x_min_main_hor[dist_x_hor>=len_x/2.0] - cy_main_hor=cy_main_hor[dist_x_hor>=len_x/2.0] - y_min_main_hor=y_min_main_hor[dist_x_hor>=len_x/2.0] - y_max_main_hor=y_max_main_hor[dist_x_hor>=len_x/2.0] + x_max_seps_hor=x_max_seps_hor[dist_x_hor>=len_x/2.0] + x_min_seps_hor=x_min_seps_hor[dist_x_hor>=len_x/2.0] + cy_seps_hor=cy_seps_hor[dist_x_hor>=len_x/2.0] + y_min_seps_hor=y_min_seps_hor[dist_x_hor>=len_x/2.0] + y_max_seps_hor=y_max_seps_hor[dist_x_hor>=len_x/2.0] dist_y_hor=dist_y_hor[dist_x_hor>=len_x/2.0] - slope_lines_org_hor=slope_lines_org_hor[dist_x_hor>=len_x/2.0] + slope_seps_org_hor=slope_seps_org_hor[dist_x_hor>=len_x/2.0] dist_x_hor=dist_x_hor[dist_x_hor>=len_x/2.0] - matrix_of_lines_ch=np.zeros((len(cy_main_hor)+len(cx_main_ver),10)) - matrix_of_lines_ch[:len(cy_main_hor),0]=args_hor - matrix_of_lines_ch[len(cy_main_hor):,0]=args_ver - matrix_of_lines_ch[len(cy_main_hor):,1]=cx_main_ver - matrix_of_lines_ch[:len(cy_main_hor),2]=x_min_main_hor+50#x_min_main_hor+150 - matrix_of_lines_ch[len(cy_main_hor):,2]=x_min_main_ver - matrix_of_lines_ch[:len(cy_main_hor),3]=x_max_main_hor-50#x_max_main_hor-150 - matrix_of_lines_ch[len(cy_main_hor):,3]=x_max_main_ver - matrix_of_lines_ch[:len(cy_main_hor),4]=dist_x_hor - matrix_of_lines_ch[len(cy_main_hor):,4]=dist_x_ver - matrix_of_lines_ch[:len(cy_main_hor),5]=cy_main_hor - matrix_of_lines_ch[:len(cy_main_hor),6]=y_min_main_hor - 
matrix_of_lines_ch[len(cy_main_hor):,6]=y_min_main_ver - matrix_of_lines_ch[:len(cy_main_hor),7]=y_max_main_hor - matrix_of_lines_ch[len(cy_main_hor):,7]=y_max_main_ver - matrix_of_lines_ch[:len(cy_main_hor),8]=dist_y_hor - matrix_of_lines_ch[len(cy_main_hor):,8]=dist_y_ver - matrix_of_lines_ch[len(cy_main_hor):,9]=1 + matrix_of_seps_ch=np.zeros((len(cy_seps_hor)+len(cx_seps_ver),10)) + matrix_of_seps_ch[:len(cy_seps_hor),0]=args_hor + matrix_of_seps_ch[len(cy_seps_hor):,0]=args_ver + matrix_of_seps_ch[len(cy_seps_hor):,1]=cx_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),2]=x_min_seps_hor+50#x_min_seps_hor+150 + matrix_of_seps_ch[len(cy_seps_hor):,2]=x_min_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),3]=x_max_seps_hor-50#x_max_seps_hor-150 + matrix_of_seps_ch[len(cy_seps_hor):,3]=x_max_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),4]=dist_x_hor + matrix_of_seps_ch[len(cy_seps_hor):,4]=dist_x_ver + matrix_of_seps_ch[:len(cy_seps_hor),5]=cy_seps_hor + matrix_of_seps_ch[:len(cy_seps_hor),6]=y_min_seps_hor + matrix_of_seps_ch[len(cy_seps_hor):,6]=y_min_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),7]=y_max_seps_hor + matrix_of_seps_ch[len(cy_seps_hor):,7]=y_max_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),8]=dist_y_hor + matrix_of_seps_ch[len(cy_seps_hor):,8]=dist_y_ver + matrix_of_seps_ch[len(cy_seps_hor):,9]=1 if contours_h is not None: - _, dist_x_head, x_min_main_head, x_max_main_head, cy_main_head, _, y_min_main_head, y_max_main_head, _ = \ + _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) - matrix_l_n=np.zeros((matrix_of_lines_ch.shape[0]+len(cy_main_head),matrix_of_lines_ch.shape[1])) - matrix_l_n[:matrix_of_lines_ch.shape[0],:]=np.copy(matrix_of_lines_ch[:,:]) - args_head=np.arange(len(cy_main_head)) + len(cy_main_hor) + matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + args_head = np.arange(len(cy_head)) + matrix_l_n[:, 0] = args_head + matrix_l_n[:, 2] = 
x_min_head+30 + matrix_l_n[:, 3] = x_max_head-30 + matrix_l_n[:, 4] = dist_x_head + matrix_l_n[:, 5] = y_min_head-3-8 + matrix_l_n[:, 6] = y_min_head-5-8 + matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + matrix_l_n[:, 8] = 4 + matrix_of_seps_ch = np.append( + matrix_of_seps_ch, matrix_l_n, axis=0) - matrix_l_n[matrix_of_lines_ch.shape[0]:,0]=args_head - matrix_l_n[matrix_of_lines_ch.shape[0]:,2]=x_min_main_head+30 - matrix_l_n[matrix_of_lines_ch.shape[0]:,3]=x_max_main_head-30 - matrix_l_n[matrix_of_lines_ch.shape[0]:,4]=dist_x_head - matrix_l_n[matrix_of_lines_ch.shape[0]:,5]=y_min_main_head-3-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,6]=y_min_main_head-5-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,7]=y_max_main_head#y_min_main_head+1-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,8]=4 - matrix_of_lines_ch=np.copy(matrix_l_n) + cy_seps_splitters=cy_seps_hor[(x_min_seps_hor<=.16*region_pre_p.shape[1]) & + (x_max_seps_hor>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, special_separators) - cy_main_splitters=cy_main_hor[(x_min_main_hor<=.16*region_pre_p.shape[1]) & - (x_max_main_hor>=.84*region_pre_p.shape[1])] - cy_main_splitters=np.array( list(cy_main_splitters)+list(special_separators)) if contours_h is not None: - try: - cy_main_splitters_head=cy_main_head[(x_min_main_head<=.16*region_pre_p.shape[1]) & - (x_max_main_head>=.84*region_pre_p.shape[1])] - cy_main_splitters=np.array( list(cy_main_splitters)+list(cy_main_splitters_head)) - except: - pass - args_cy_splitter=np.argsort(cy_main_splitters) - cy_main_splitters_sort=cy_main_splitters[args_cy_splitter] + cy_seps_splitters_head=cy_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) - splitter_y_new=[] - splitter_y_new.append(0) - for i in range(len(cy_main_splitters_sort)): - splitter_y_new.append( cy_main_splitters_sort[i] ) - 
splitter_y_new.append(region_pre_p.shape[0]) - splitter_y_new_diff=np.diff(splitter_y_new)/float(region_pre_p.shape[0])*100 + cy_seps_splitters = np.sort(cy_seps_splitters) + splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] + splitter_y_new_diff = np.diff(splitter_y_new) / float(region_pre_p.shape[0]) * 100 args_big_parts=np.arange(len(splitter_y_new_diff))[ splitter_y_new_diff>22 ] @@ -1573,7 +1547,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, peaks_neg_fin=peaks_neg_fin[peaks_neg_fin<=(vertical.shape[1]-500)] peaks_neg_fin_fin=peaks_neg_fin[:] - return num_col_fin, peaks_neg_fin_fin,matrix_of_lines_ch,splitter_y_new,separators_closeup_n + return num_col_fin, peaks_neg_fin_fin, matrix_of_seps_ch, splitter_y_new, separators_closeup_n def return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, From 5a0e4c3b0f2e089acff0b4fbf058f1d2e6f90f66 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:36:10 +0200 Subject: [PATCH 007/118] `find_number_of_columns_in_document`: improve splitter rule extend horizontal separators to full image width if they do not overlap any other regions (only with regard to the returned `splitter_y` result, but without changing the returned separators mask) --- src/eynollah/utils/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 168899f..b930bfd 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1378,6 +1378,8 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): return peaks_neg_tot def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): + ncomps, ccomps = cv2.connectedComponents(region_pre_p.astype(np.uint8)) + separators_closeup = 1 * (region_pre_p == label_seps) separators_closeup[0:110] = 0 separators_closeup[-150:] = 0 @@
-1398,10 +1400,19 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, min_xe = cnt[:, 0, 0].min() max_ye = cnt[:, 0, 1].max() min_ye = cnt[:, 0, 1].min() + med_ye = int(np.median(cnt[:, 0, 1])) dist_xe = max_xe - min_xe dist_ye = max_ye - min_ye if dist_ye <= 50 and dist_xe >= 3 * dist_ye: cnts_hor_e.append(cnt) + labels = np.setdiff1d(np.unique(ccomps[med_ye]), [0]) + if len(labels) == 1: + # mid line does not intersect with any other region + # so add it as extra splitter line + cnts_hor_e.append(np.array([[[0, med_ye]], + [[ccomps.shape[1], med_ye]], + [[ccomps.shape[1], med_ye + 1]], + [[0, med_ye + 1]]])) # delete horizontal contours (leaving only the edges) separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) From cd35241e816acc7e2083dc31d99f376a8877904b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:41:36 +0200 Subject: [PATCH 008/118] `find_number_of_columns_in_document`: split headings at top+baseline regarding `splitter_y` result, for headings, instead of cutting right through them via center line, add their toplines and baselines as if they were horizontal separators --- src/eynollah/utils/__init__.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index b930bfd..0c3e4ae 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1506,15 +1506,33 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, if contours_h is not None: _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) + # matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + # args_head = np.arange(len(cy_head)) + # matrix_l_n[:, 0] = args_head + # matrix_l_n[:, 2] = x_min_head+30 + # matrix_l_n[:, 3] = x_max_head-30 + # matrix_l_n[:, 4] = dist_x_head + 
# matrix_l_n[:, 5] = y_min_head-3-8 + # matrix_l_n[:, 6] = y_min_head-5-8 + # matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + # matrix_l_n[:, 8] = 4 + # split at toplines (y_min_head) and baselines (y_max_head) instead of center (cy_head): + cy_head = np.stack((y_min_head, y_max_head)).T.flatten() + y_min_head, y_max_head = (np.stack((y_min_head - 2, y_max_head - 2)).T.flatten(), + np.stack((y_min_head + 2, y_max_head + 2)).T.flatten()) + x_min_head = np.repeat(x_min_head, 2) + x_max_head = np.repeat(x_max_head, 2) + dist_x_head = np.repeat(dist_x_head, 2) matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) args_head = np.arange(len(cy_head)) matrix_l_n[:, 0] = args_head - matrix_l_n[:, 2] = x_min_head+30 - matrix_l_n[:, 3] = x_max_head-30 + # +/- 30px to avoid crossing col peaks by accident + matrix_l_n[:, 2] = x_min_head + 30 + matrix_l_n[:, 3] = x_max_head - 30 matrix_l_n[:, 4] = dist_x_head - matrix_l_n[:, 5] = y_min_head-3-8 - matrix_l_n[:, 6] = y_min_head-5-8 - matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + matrix_l_n[:, 5] = cy_head + matrix_l_n[:, 6] = y_min_head + matrix_l_n[:, 7] = y_max_head matrix_l_n[:, 8] = 4 matrix_of_seps_ch = np.append( matrix_of_seps_ch, matrix_l_n, axis=0) From 7c3e41858877211c82f5b6c91a02fccfe146cacb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 16:13:51 +0200 Subject: [PATCH 009/118] `return_boxes_of_images_by_order_of_reading_new`: simplify - enumeration instead of indexing - array instead of list operations - add better plotting (but commented out) --- src/eynollah/utils/__init__.py | 349 ++++++++++++++++----------------- 1 file changed, 165 insertions(+), 184 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 0c3e4ae..698b0bd 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -5,6 +5,7 @@ import math try: import matplotlib.pyplot as plt + import matplotlib.patches as patches except ImportError: plt = None 
import numpy as np @@ -20,6 +21,7 @@ from .contour import (contours_in_same_horizon, return_contours_of_image, return_parent_contours) + def pairwise(iterable): # pairwise('ABCDEFG') → AB BC CD DE EF FG @@ -205,15 +207,15 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( #print(x_end,'x_end') #print(len_sep) - deleted=[] + deleted = set() for i in range(len(x_start)-1): nodes_i=set(range(x_start[i],x_end[i]+1)) for j in range(i+1,len(x_start)): if nodes_i==set(range(x_start[j],x_end[j]+1)): - deleted.append(j) + deleted.add(j) #print(np.unique(deleted)) - remained_sep_indexes=set(range(len(x_start)))-set(np.unique(deleted) ) + remained_sep_indexes = set(range(len(x_start))) - deleted #print(remained_sep_indexes,'remained_sep_indexes') mother=[]#if it has mother child=[] @@ -262,7 +264,7 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother] - reading_orther_type=0 + reading_order_type=0 x_end_without_mother = x_end[remained_sep_indexes_without_mother] x_start_without_mother = x_start[remained_sep_indexes_without_mother] y_lines_without_mother = y_sep[remained_sep_indexes_without_mother] @@ -278,12 +280,11 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_end[remained_sep_indexes_without_mother[j]] # + 1 )) - set_diff = nodes_i - nodes_j - if set_diff != nodes_i: - reading_orther_type = 1 + if nodes_i - nodes_j != nodes_i: + reading_order_type = 1 else: - reading_orther_type = 0 - #print(reading_orther_type,'javab') + reading_order_type = 0 + #print(reading_order_type,'javab') #print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother') #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') @@ -297,7 +298,7 @@ def 
return_x_start_end_mothers_childs_and_type_of_reading_order( #print(all_args_uniq,'all_args_uniq') #print(args_to_be_unified,'args_to_be_unified') - return (reading_orther_type, + return (reading_order_type, x_start_returned, x_end_returned, y_sep_returned, @@ -1590,77 +1591,90 @@ def return_boxes_of_images_by_order_of_reading_new( if logger is None: logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') + # def dbg_plt(box=None, title=None): + # if box is None: + # box = [None, None, None, None] + # img = regions_without_separators[box[2]:box[3], box[0]:box[1]] + # plt.imshow(img) + # xrange = np.arange(0, img.shape[1], 100) + # yrange = np.arange(0, img.shape[0], 100) + # plt.gca().set_xticks(xrange, xrange + (box[0] or 0)) + # plt.gca().set_yticks(yrange, yrange + (box[2] or 0)) + # if title: + # plt.title(title) + # plt.show() + # dbg_plt() boxes=[] peaks_neg_tot_tables = [] splitter_y_new = np.array(splitter_y_new, dtype=int) - for i in range(len(splitter_y_new)-1): - #print(splitter_y_new[i],splitter_y_new[i+1]) - matrix_new = matrix_of_lines_ch[:,:][(matrix_of_lines_ch[:,6]> splitter_y_new[i] ) & - (matrix_of_lines_ch[:,7]< splitter_y_new[i+1] )] + width_tot = regions_without_separators.shape[1] + for top, bot in pairwise(splitter_y_new): + # print("%d:%d" % (top, bot), 'i') + # dbg_plt([None, None, top, bot], + # "image cut for y split %d:%d" % ( + # top, bot)) + matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) & + (matrix_of_lines_ch[:,7] < bot)] #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') # check to see is there any vertical separator to find holes. 
#if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= - # 0.1 * (np.abs(splitter_y_new[i+1]-splitter_y_new[i]))): + # 0.1 * (np.abs(bot-top))): if True: try: num_col, peaks_neg_fin = find_num_col( - regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], :], + regions_without_separators[top:bot], num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.) except: peaks_neg_fin=[] num_col = 0 try: if (len(peaks_neg_fin)+1)=len(peaks_neg_fin2): - peaks_neg_fin=list(np.copy(peaks_neg_fin1)) + peaks_neg_fin2 = [] + if len(peaks_neg_fin1) >= len(peaks_neg_fin2): + peaks_neg_fin = peaks_neg_fin1 else: - peaks_neg_fin=list(np.copy(peaks_neg_fin2)) - peaks_neg_fin=list(np.array(peaks_neg_fin)+peaks_neg_fin_early[i_n]) - - if i_n!=(len(peaks_neg_fin_early)-2): - peaks_neg_fin_rev.append(peaks_neg_fin_early[i_n+1]) + peaks_neg_fin = peaks_neg_fin2 + peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - peaks_neg_fin_rev=peaks_neg_fin_rev+peaks_neg_fin + + if right < peaks_neg_fin_early[-1]: + peaks_neg_fin_rev.append(right) + peaks_neg_fin_rev.extend(peaks_neg_fin) if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) @@ -1673,21 +1687,20 @@ def return_boxes_of_images_by_order_of_reading_new( except: logger.exception("cannot find peaks consistent with columns") #num_col, peaks_neg_fin = find_num_col( - # regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1],:], + # regions_without_separators[top:bot,:], # multiplier=7.0) x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] - arg_org_hor_some=matrix_new[:,0][ (matrix_new[:,9]==0) ] if right2left_readingorder: - x_max_hor_some_new = regions_without_separators.shape[1] - x_min_hor_some - x_min_hor_some_new = 
regions_without_separators.shape[1] - x_max_hor_some + x_max_hor_some_new = width_tot - x_min_hor_some + x_min_hor_some_new = width_tot - x_max_hor_some x_min_hor_some =list(np.copy(x_min_hor_some_new)) x_max_hor_some =list(np.copy(x_max_hor_some_new)) - peaks_neg_tot=return_points_with_boundies(peaks_neg_fin,0, regions_without_separators[:,:].shape[1]) + peaks_neg_tot = [0] + peaks_neg_fin + [width_tot] peaks_neg_tot_tables.append(peaks_neg_tot) reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \ @@ -1697,26 +1710,27 @@ def return_boxes_of_images_by_order_of_reading_new( x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff) all_columns = set(range(len(peaks_neg_tot) - 1)) - if ((reading_order_type==1) or - (reading_order_type==0 and - (len(y_lines_without_mother)>=2 or there_is_sep_with_child==1))): + # print("all_columns", all_columns) + if (reading_order_type == 1 or + len(y_lines_without_mother) >= 2 or + there_is_sep_with_child == 1): try: - y_grenze = splitter_y_new[i] + 300 + y_grenze = top + 300 #check if there is a big separator in this y_mains_sep_ohne_grenzen args_early_ys=np.arange(len(y_type_2)) #print(args_early_ys,'args_early_ys') - #print(splitter_y_new[i], splitter_y_new[i+1]) + #print(top, bot) - x_starting_up = x_starting[(y_type_2 > splitter_y_new[i]) & + x_starting_up = x_starting[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_ending_up = x_ending[(y_type_2 > splitter_y_new[i]) & + x_ending_up = x_ending[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_type_2_up = y_type_2[(y_type_2 > splitter_y_new[i]) & + y_type_2_up = y_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_diff_type_2_up = y_diff_type_2[(y_type_2 > splitter_y_new[i]) & + y_diff_type_2_up = y_diff_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - args_up = args_early_ys[(y_type_2 > splitter_y_new[i]) & + args_up = args_early_ys[(y_type_2 > top) & (y_type_2 <= y_grenze)] if len(y_type_2_up) > 0: y_main_separator_up = y_type_2_up 
[(x_starting_up==0) & @@ -1730,27 +1744,28 @@ def return_boxes_of_images_by_order_of_reading_new( args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) #print(args_to_be_kept,'args_to_be_kept') boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - splitter_y_new[i], y_diff_main_separator_up.max()]) - splitter_y_new[i] = y_diff_main_separator_up.max() + top, y_diff_main_separator_up.max()]) + # dbg_plt(boxes[-1], "first box") + top = y_diff_main_separator_up.max() - #print(splitter_y_new[i],'splitter_y_new[i]') + #print(top,'top') y_type_2 = y_type_2[args_to_be_kept] x_starting = x_starting[args_to_be_kept] x_ending = x_ending[args_to_be_kept] y_diff_type_2 = y_diff_type_2[args_to_be_kept] #print('galdiha') - y_grenze = splitter_y_new[i] + 200 + y_grenze = top + 200 args_early_ys2=np.arange(len(y_type_2)) - y_type_2_up=y_type_2[(y_type_2 > splitter_y_new[i]) & + y_type_2_up=y_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_starting_up=x_starting[(y_type_2 > splitter_y_new[i]) & + x_starting_up=x_starting[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_ending_up=x_ending[(y_type_2 > splitter_y_new[i]) & + x_ending_up=x_ending[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_diff_type_2_up=y_diff_type_2[(y_type_2 > splitter_y_new[i]) & + y_diff_type_2_up=y_diff_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - args_up2=args_early_ys2[(y_type_2 > splitter_y_new[i]) & + args_up2=args_early_ys2[(y_type_2 > top) & (y_type_2 <= y_grenze)] #print(y_type_2_up,x_starting_up,x_ending_up,'didid') nodes_in = set() @@ -1804,13 +1819,14 @@ def return_boxes_of_images_by_order_of_reading_new( pass #print('burdaydikh2') - #int(splitter_y_new[i]) + #int(top) y_lines_by_order=[] x_start_by_order=[] x_end_by_order=[] - if (len(x_end_with_child_without_mother)==0 and reading_order_type==0) or reading_order_type==1: - if reading_order_type==1: - y_lines_by_order.append(splitter_y_new[i]) + if (reading_order_type == 1 or + 
len(x_end_with_child_without_mother) == 0): + if reading_order_type == 1: + y_lines_by_order.append(top) x_start_by_order.append(0) x_end_by_order.append(len(peaks_neg_tot)-2) else: @@ -1823,8 +1839,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_not_covered = list(all_columns - columns_covered_by_mothers) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) @@ -1839,22 +1855,15 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args[x_starting==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() @@ -1864,8 +1873,8 @@ def 
return_boxes_of_images_by_order_of_reading_new( x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) @@ -1888,25 +1897,24 @@ def return_boxes_of_images_by_order_of_reading_new( x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) for i_s_nc in columns_not_covered_child_no_mother: if i_s_nc in x_start_with_child_without_mother: + #print("i_s_nc", i_s_nc) x_end_biggest_column = \ x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & (x_ending==x_end_biggest_column)] y_column_nc = y_type_2[args_all_biggest_lines] - x_start_column_nc = x_starting[args_all_biggest_lines] - x_end_column_nc = x_ending[args_all_biggest_lines] + #x_start_column_nc = x_starting[args_all_biggest_lines] + #x_end_column_nc = x_ending[args_all_biggest_lines] y_column_nc = np.sort(y_column_nc) for i_c in range(len(y_column_nc)): - if i_c==(len(y_column_nc)-1): - ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & - (y_type_2=i_s_nc) & - (x_ending<=x_end_biggest_column)] - else: - ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & - (y_type_2=i_s_nc) & - (x_ending<=x_end_biggest_column)] + #print("i_c", i_c) + ind_all_lines_between_nm_wc = \ + ind_args[(y_type_2 > y_column_nc[i_c]) & + (y_type_2 < (y_column_nc[i_c+1] + if i_c < len(y_column_nc)-1 + else bot)) & + (x_starting >= 
i_s_nc) & + (x_ending <= x_end_biggest_column)] y_all_between_nm_wc = y_type_2[ind_all_lines_between_nm_wc] x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] @@ -1965,78 +1973,58 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_all_between_nm_wc[ind_args_in_col] x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] #print('babali3') ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: #print(column,'column') ind_args_in_col=ind_args[x_starting==i_s_nc] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - 
x_end_by_order.append(x_end_column_sort[ii]-1) + ind_args_col_sorted = np.argsort(y_column) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + y_lines_by_order = np.array(y_lines_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) for il in range(len(y_lines_by_order)): - y_copy = list(y_lines_by_order) - x_start_copy = list(x_start_by_order) - x_end_copy = list(x_end_by_order) - - #print(y_copy,'y_copy') - y_itself=y_copy.pop(il) - x_start_itself=x_start_copy.pop(il) - x_end_itself=x_end_copy.pop(il) - - #print(y_copy,'y_copy2') + #print(il, "il") + y_itself = y_lines_by_order[il] + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') - y_in_cols=[] - for yic in range(len(y_copy)): - #print('burda') - if (y_copy[yic]>y_itself and - column>=x_start_copy[yic] and - column<=x_end_copy[yic]): - y_in_cols.append(y_copy[yic]) + y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print('burda') + y_down = y_in_cols.min(initial=bot) #print('burda2') #print(y_in_cols,'y_in_cols') - if len(y_in_cols)>0: - y_down=np.min(y_in_cols) - else: - y_down=splitter_y_new[i+1] #print(y_itself,'y_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_itself, y_down]) + # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) except: logger.exception("cannot assign boxes") boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - splitter_y_new[i], splitter_y_new[i+1]]) + top, bot]) + # dbg_plt(boxes[-1], "fallback box") else: y_lines_by_order=[] x_start_by_order=[] @@ -2050,8 +2038,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_not_covered = list(all_columns - 
columns_covered_by_lines_covered_more_than_2col) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1, - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2064,8 +2052,8 @@ def return_boxes_of_images_by_order_of_reading_new( else: columns_not_covered = list(all_columns) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2075,71 +2063,64 @@ def return_boxes_of_images_by_order_of_reading_new( for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - 
x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + ind_args_col_sorted = np.argsort(y_column) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + y_lines_by_order = np.array(y_lines_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) for il in range(len(y_lines_by_order)): - y_copy = list(y_lines_by_order) - x_start_copy = list(x_start_by_order) - x_end_copy = list(x_end_by_order) - - #print(y_copy,'y_copy') - y_itself=y_copy.pop(il) - x_start_itself=x_start_copy.pop(il) - x_end_itself=x_end_copy.pop(il) - + #print(il, "il") + y_itself = y_lines_by_order[il] + #print(y_itself,'y_itself') + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] for column in range(x_start_itself, x_end_itself+1): #print(column,'cols') - y_in_cols=[] - for yic in range(len(y_copy)): - #print('burda') - if (y_copy[yic]>y_itself and - column>=x_start_copy[yic] and - column<=x_end_copy[yic]): - y_in_cols.append(y_copy[yic]) + y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] #print('burda2') #print(y_in_cols,'y_in_cols') - if len(y_in_cols)>0: - y_down=np.min(y_in_cols) - else: - y_down=splitter_y_new[i+1] - #print(y_itself,'y_itself') + y_down = y_in_cols.min(initial=bot) + #print(y_down,'y_down') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_itself, y_down]) + # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) #else: - #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,splitter_y_new[i],splitter_y_new[i+1]]) + #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,top, bot]) if right2left_readingorder: peaks_neg_tot_tables_new = [] if len(peaks_neg_tot_tables)>=1: for peaks_tab_ind in peaks_neg_tot_tables: - 
peaks_neg_tot_tables_ind = regions_without_separators.shape[1] - np.array(peaks_tab_ind) + peaks_neg_tot_tables_ind = width_tot - np.array(peaks_tab_ind) peaks_neg_tot_tables_ind = list(peaks_neg_tot_tables_ind[::-1]) peaks_neg_tot_tables_new.append(peaks_neg_tot_tables_ind) for i in range(len(boxes)): - x_start_new = regions_without_separators.shape[1] - boxes[i][1] - x_end_new = regions_without_separators.shape[1] - boxes[i][0] + x_start_new = width_tot - boxes[i][1] + x_end_new = width_tot - boxes[i][0] boxes[i][0] = x_start_new boxes[i][1] = x_end_new peaks_neg_tot_tables = peaks_neg_tot_tables_new + # show final xy-cut + # plt.imshow(regions_without_separators) + # for xmin, xmax, ymin, ymax in boxes: + # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # plt.show() + logger.debug('exit return_boxes_of_images_by_order_of_reading_new') return boxes, peaks_neg_tot_tables From 0fc4b2535dc005612406cd4ffbf2471a5b4e1485 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 16:47:35 +0200 Subject: [PATCH 010/118] `return_boxes_of_images_by_order_of_reading_new`: fix no-mother case - when handling lines without mother, and biggest line already accounts for all columns, but some are too close to the top and therefore must be removed, avoid invalidating `biggest` index, causing `IndexError` - remove try-catch (now unnecessary) - array instead of list operations --- src/eynollah/utils/__init__.py | 62 ++++++++++++++++------------------ 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 698b0bd..b331cab 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1919,54 +1919,50 @@ def return_boxes_of_images_by_order_of_reading_new( x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] - 
x_diff_all_between_nm_wc = x_ending_all_between_nm_wc - x_starting_all_between_nm_wc - if len(x_diff_all_between_nm_wc)>0: - biggest=np.argmax(x_diff_all_between_nm_wc) - columns_covered_by_mothers = set() - for dj in range(len(x_starting_all_between_nm_wc)): + for dj in range(len(ind_all_lines_between_nm_wc)): columns_covered_by_mothers.update( range(x_starting_all_between_nm_wc[dj], x_ending_all_between_nm_wc[dj])) child_columns = set(range(i_s_nc, x_end_biggest_column)) columns_not_covered = list(child_columns - columns_covered_by_mothers) - should_longest_line_be_extended=0 - if (len(x_diff_all_between_nm_wc) > 0 and - set(list(range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])) + - list(columns_not_covered)) != child_columns): - should_longest_line_be_extended=1 - index_lines_so_close_to_top_separator = \ - np.arange(len(y_all_between_nm_wc))[(y_all_between_nm_wc>y_column_nc[i_c]) & - (y_all_between_nm_wc<=(y_column_nc[i_c]+500))] - if len(index_lines_so_close_to_top_separator) > 0: - indexes_remained_after_deleting_closed_lines= \ - np.array(list(set(list(range(len(y_all_between_nm_wc)))) - - set(list(index_lines_so_close_to_top_separator)))) - if len(indexes_remained_after_deleting_closed_lines) > 0: + if len(ind_all_lines_between_nm_wc): + biggest = np.argmax(x_ending_all_between_nm_wc - + x_starting_all_between_nm_wc) + if columns_covered_by_mothers == set( + range(x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest])): + # biggest accounts for all columns alone, + # longest line should be extended + lines_so_close_to_top_separator = \ + ((y_all_between_nm_wc > y_column_nc[i_c]) & + (y_all_between_nm_wc <= y_column_nc[i_c] + 500)) + if (np.count_nonzero(lines_so_close_to_top_separator) and + np.count_nonzero(lines_so_close_to_top_separator) < + len(ind_all_lines_between_nm_wc)): y_all_between_nm_wc = \ - y_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + 
y_all_between_nm_wc[~lines_so_close_to_top_separator] x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + x_starting_all_between_nm_wc[~lines_so_close_to_top_separator] x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + x_ending_all_between_nm_wc[~lines_so_close_to_top_separator] - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) - - if len(x_diff_all_between_nm_wc) > 0: - try: + y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) + x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) + else: y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - except: - logger.exception("cannot append") - y_all_between_nm_wc = np.append(y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) + if len(columns_not_covered): + y_all_between_nm_wc = np.append( + y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) 
ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) for column in range(int(i_s_nc), int(x_end_biggest_column)): From e2dfec75fbefe3e5aeffd71a7a61eab6092f6c92 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:19:20 +0200 Subject: [PATCH 011/118] `return_x_start_end_mothers_childs_and_type_of_reading_order`: simplify and document MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - simplify - rename identifiers to make readable: - `y_sep` → `y_mid` (because the cy gets passed) - `y_diff` → `y_max` (because the ymax gets passed) - array instead of list operations - add docstring and in-line comments - return (zero-length) numpy array instead of empty list --- src/eynollah/eynollah.py | 10 +- src/eynollah/utils/__init__.py | 378 +++++++++++++++++---------------- 2 files changed, 198 insertions(+), 190 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 9412861..08ffed7 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2507,6 +2507,7 @@ class Eynollah: My_main[ii] < box[3])): arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True + #print("main/matched", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", box, only_centers) break if not check_if_textregion_located_in_a_box: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) @@ -2514,6 +2515,7 @@ class Eynollah: (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_main[ii] = ind_min + #print("main/fallback", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", boxes[ind_min], only_centers) args_contours_main = np.arange(len(contours_only_text_parent)) order_by_con_main = np.zeros_like(arg_text_con_main) @@ -2531,6 +2533,7 @@ class Eynollah: My_head[ii] < box[3])): arg_text_con_head[ii] = jj 
check_if_textregion_located_in_a_box = True + #print("head/matched", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", box, only_centers) break if not check_if_textregion_located_in_a_box: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) @@ -2538,6 +2541,7 @@ class Eynollah: (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_head[ii] = ind_min + #print("head/fallback", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", boxes[ind_min], only_centers) args_contours_head = np.arange(len(contours_only_text_parent_h)) order_by_con_head = np.zeros_like(arg_text_con_head) @@ -2587,7 +2591,7 @@ class Eynollah: try: results = match_boxes(False) except Exception as why: - self.logger.error(why) + self.logger.exception(why) results = match_boxes(True) self.logger.debug("exit do_order_of_regions") @@ -2976,7 +2980,7 @@ class Eynollah: max(self.num_col_lower or num_col_classifier, num_col_classifier)) except Exception as why: - self.logger.error(why) + self.logger.exception(why) num_col = None #print("inside graphics 3 ", time.time() - t_in_gr) return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, @@ -3044,7 +3048,7 @@ class Eynollah: if not num_column_is_classified: num_col_classifier = num_col + 1 except Exception as why: - self.logger.error(why) + self.logger.exception(why) num_col = None return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page, table_prediction) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index b331cab..f1a8aae 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -33,226 +33,229 @@ def pairwise(iterable): a = b def return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, 
x_max_hor_some, cy_hor_some, peak_points, cy_hor_diff): + x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, y_max_hor_some): + """ + Analyse which separators overlap multiple column candidates, + and how they overlap each other. + + Ignore separators not spanning multiple columns. + + For the separators to be returned, try to join them when they are directly + adjacent horizontally but nearby vertically (and thus mutually compatible). + Also, mark any separators that already span the full width. + + Furthermore, identify which pairs of (unjoined) separators span subsets of columns + of each other (disregarding vertical positions). Referring, respectively, to the + superset separators as "mothers" and to the subset separators as "children", + retrieve information on which columns are spanned by separators with no mother, + and which columns are spanned by their children (if any). + + Moreover, determine if there is any (column) overlap among the multi-span separators + with no mother, specifically (and thus, no simple box separation is possible). 
+ + Arguments: + * the x start column index of the raw separators + * the x end column index of the raw separators + * the y center coordinate of the raw separators + * the x column coordinates + * the y end coordinate of the raw separators + + Returns: + a tuple of: + * whether any top-level (no-mother) multi-span separators overlap each other + * the x start column index of the resulting multi-span separators + * the x end column index of the resulting multi-span separators + * the y center coordinate of the resulting multi-span separators + * the y end coordinate of the resulting multi-span separators + * the y center (for 1 representative) of the top-level (no-mother) multi-span separators + * the x start column index of the top-level (no-mother) multi-span separators + * the x end column index of the top-level (no-mother) multi-span separators + * whether any multi-span separators have super-spans of other (child) multi-span separators + * the y center (for 1 representative) of the top-level (no-mother) multi-span separators + which have super-spans of other (child) multi-span separators + * the x start column index of the top-level multi-span separators + which have super-spans of other (child) multi-span separators + * the x end column index of the top-level multi-span separators + which have super-spans of other (child) multi-span separators + * indexes of multi-span separators with full-width span + """ x_start=[] x_end=[] - kind=[]#if covers 2 and more than 2 columns set it to 1 otherwise 0 len_sep=[] - y_sep=[] - y_diff=[] + y_mid=[] + y_max=[] new_main_sep_y=[] - indexer=0 for i in range(len(x_min_hor_some)): - starting=x_min_hor_some[i]-peak_points - starting=starting[starting>=0] - min_start=np.argmin(starting) - ending=peak_points-x_max_hor_some[i] - len_ending_neg=len(ending[ending<=0]) - - ending=ending[ending>0] - max_end=np.argmin(ending)+len_ending_neg + #print(indexer, "%d:%d" % (x_min_hor_some[i], x_max_hor_some[i]), cy_hor_some[i]) + starting 
= x_min_hor_some[i] - peak_points + min_start = np.flatnonzero(starting >= 0)[-1] # last left-of + ending = x_max_hor_some[i] - peak_points + max_end = np.flatnonzero(ending < 0)[0] # first right-of + #print(indexer, "%d:%d" % (min_start, max_end)) if (max_end-min_start)>=2: + # column range of separator spans more than one column candidate if (max_end-min_start)==(len(peak_points)-1): + # all columns (i.e. could be true new y splitter) new_main_sep_y.append(indexer) #print((max_end-min_start),len(peak_points),'(max_end-min_start)') - y_sep.append(cy_hor_some[i]) - y_diff.append(cy_hor_diff[i]) + y_mid.append(cy_hor_some[i]) + y_max.append(y_max_hor_some[i]) x_end.append(max_end) - - x_start.append( min_start) - + x_start.append(min_start) len_sep.append(max_end-min_start) - if max_end==min_start+1: - kind.append(0) - else: - kind.append(1) - indexer+=1 + #print(x_start,'x_start') + #print(x_end,'x_end') x_start_returned = np.array(x_start, dtype=int) x_end_returned = np.array(x_end, dtype=int) - y_sep_returned = np.array(y_sep, dtype=int) - y_diff_returned = np.array(y_diff, dtype=int) - - all_args_uniq = contours_in_same_horizon(y_sep_returned) - args_to_be_unified=[] - y_unified=[] - y_diff_unified=[] - x_s_unified=[] - x_e_unified=[] - if len(all_args_uniq)>0: - #print('burda') - if type(all_args_uniq[0]) is list: - for dd in range(len(all_args_uniq)): - if len(all_args_uniq[dd])==2: - x_s_same_hor=np.array(x_start_returned)[all_args_uniq[dd]] - x_e_same_hor=np.array(x_end_returned)[all_args_uniq[dd]] - y_sep_same_hor=np.array(y_sep_returned)[all_args_uniq[dd]] - y_diff_same_hor=np.array(y_diff_returned)[all_args_uniq[dd]] - #print('burda2') - if (x_s_same_hor[0]==x_e_same_hor[1]-1 or - x_s_same_hor[1]==x_e_same_hor[0]-1 and - x_s_same_hor[0]!=x_s_same_hor[1] and - x_e_same_hor[0]!=x_e_same_hor[1]): - #print('burda3') - for arg_in in all_args_uniq[dd]: - #print(arg_in,'arg_in') - args_to_be_unified.append(arg_in) - y_selected=np.min(y_sep_same_hor) - 
y_diff_selected=np.max(y_diff_same_hor) - x_s_selected=np.min(x_s_same_hor) - x_e_selected=np.max(x_e_same_hor) - - x_s_unified.append(x_s_selected) - x_e_unified.append(x_e_selected) - y_unified.append(y_selected) - y_diff_unified.append(y_diff_selected) - #print(x_s_same_hor,'x_s_same_hor') - #print(x_e_same_hor[:]-1,'x_e_same_hor') - #print('#############################') - #print(x_s_unified,'y_selected') - #print(x_e_unified,'x_s_selected') - #print(y_unified,'x_e_same_hor') - - args_lines_not_unified=list( set(range(len(y_sep_returned)))-set(args_to_be_unified) ) - #print(args_lines_not_unified,'args_lines_not_unified') - - x_start_returned_not_unified=list( np.array(x_start_returned)[args_lines_not_unified] ) - x_end_returned_not_unified=list( np.array(x_end_returned)[args_lines_not_unified] ) - y_sep_returned_not_unified=list (np.array(y_sep_returned)[args_lines_not_unified] ) - y_diff_returned_not_unified=list (np.array(y_diff_returned)[args_lines_not_unified] ) - - for dv in range(len(y_unified)): - y_sep_returned_not_unified.append(y_unified[dv]) - y_diff_returned_not_unified.append(y_diff_unified[dv]) - x_start_returned_not_unified.append(x_s_unified[dv]) - x_end_returned_not_unified.append(x_e_unified[dv]) - - #print(y_sep_returned,'y_sep_returned') + y_mid_returned = np.array(y_mid, dtype=int) + y_max_returned = np.array(y_max, dtype=int) + #print(y_mid_returned,'y_mid_returned') #print(x_start_returned,'x_start_returned') #print(x_end_returned,'x_end_returned') - x_start_returned = np.array(x_start_returned_not_unified, dtype=int) - x_end_returned = np.array(x_end_returned_not_unified, dtype=int) - y_sep_returned = np.array(y_sep_returned_not_unified, dtype=int) - y_diff_returned = np.array(y_diff_returned_not_unified, dtype=int) + # join/elongate separators if follow-up x and similar y + sep_pairs = contours_in_same_horizon(y_mid_returned) + if len(sep_pairs): + #print('burda') + args_to_be_unified = set() + y_mid_unified = [] + y_max_unified = [] 
+ x_start_unified = [] + x_end_unified = [] + for pair in sep_pairs: + if (not np.array_equal(*x_start_returned[pair]) and + not np.array_equal(*x_end_returned[pair]) and + # immediately adjacent columns? + np.diff(x_end_returned[pair] - + x_start_returned[pair])[0] in [1, -1]): - #print(y_sep_returned,'y_sep_returned2') + args_to_be_unified.union(set(pair)) + y_mid_unified.append(np.min(y_mid_returned[pair])) + y_max_unified.append(np.max(y_max_returned[pair])) + x_start_unified.append(np.min(x_start_returned[pair])) + x_end_unified.append(np.max(x_end_returned[pair])) + #print(pair,'pair') + #print(x_start_returned[pair],'x_s_same_hor') + #print(x_end_returned[pair],'x_e_same_hor') + #print(y_mid_unified,'y_mid_unified') + #print(y_max_unified,'y_max_unified') + #print(x_start_unified,'x_s_unified') + #print(x_end_unified,'x_e_selected') + #print('#############################') + + if len(y_mid_unified): + args_lines_not_unified = np.setdiff1d(np.arange(len(y_mid_returned)), + list(args_to_be_unified), assume_unique=True) + #print(args_lines_not_unified,'args_lines_not_unified') + x_start_returned = np.append(x_start_returned[args_lines_not_unified], + x_start_unified, axis=0) + x_end_returned = np.append(x_end_returned[args_lines_not_unified], + x_end_unified, axis=0) + y_mid_returned = np.append(y_mid_returned[args_lines_not_unified], + y_mid_unified, axis=0) + y_max_returned = np.append(y_max_returned[args_lines_not_unified], + y_max_unified, axis=0) + #print(y_mid_returned,'y_mid_returned2') #print(x_start_returned,'x_start_returned2') #print(x_end_returned,'x_end_returned2') - #print(new_main_sep_y,'new_main_sep_y') + #print(new_main_sep_y,'new_main_sep_y') #print(x_start,'x_start') #print(x_end,'x_end') - if len(new_main_sep_y)>0: + x_start = np.array(x_start) + x_end = np.array(x_end) + y_mid = np.array(y_mid) + if len(new_main_sep_y): + # some full-width multi-span separators exist, so + # restrict the y range of separators to search for + # mutual 
overlaps to only those within the largest + # y strip between adjacent multi-span separators + # that involve at least one such full-width seps. + # (does not affect the separators to be returned) + min_ys=np.min(y_mid) + max_ys=np.max(y_mid) + #print(min_ys,'min_ys') + #print(max_ys,'max_ys') - min_ys=np.min(y_sep) - max_ys=np.max(y_sep) + y_mains0 = list(y_mid[new_main_sep_y]) + y_mains = [min_ys] + y_mains0 + [max_ys] - y_mains=[] - y_mains.append(min_ys) - y_mains_sep_ohne_grenzen=[] + y_mains = np.sort(y_mains) + argm = np.argmax(np.diff(y_mains)) + y_mid_new = y_mains[argm] + y_mid_next_new = y_mains[argm + 1] - for ii in range(len(new_main_sep_y)): - y_mains.append(y_sep[new_main_sep_y[ii]]) - y_mains_sep_ohne_grenzen.append(y_sep[new_main_sep_y[ii]]) - - y_mains.append(max_ys) - - y_mains_sorted=np.sort(y_mains) - diff=np.diff(y_mains_sorted) - argm=np.argmax(diff) - - y_min_new=y_mains_sorted[argm] - y_max_new=y_mains_sorted[argm+1] - - #print(y_min_new,'y_min_new') - #print(y_max_new,'y_max_new') - #print(y_sep[new_main_sep_y[0]],y_sep,'yseps') + #print(y_mid_new,argm,'y_mid_new') + #print(y_mid_next_new,argm+1,'y_mid_next_new') + #print(y_mid[new_main_sep_y],new_main_sep_y,'yseps') x_start=np.array(x_start) x_end=np.array(x_end) - kind=np.array(kind) - y_sep=np.array(y_sep) - if (y_min_new in y_mains_sep_ohne_grenzen and - y_max_new in y_mains_sep_ohne_grenzen): - x_start=x_start[(y_sep>y_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sep<=y_max_new)] - #print('burda1') - x_end=x_end[(y_sep>y_min_new) & (y_sep<=y_max_new)] - #print('burda2') - kind=kind[(y_sep>y_min_new) & (y_sep<=y_max_new)] - y_sep=y_sep[(y_sep>y_min_new) & (y_sep<=y_max_new)] - elif (y_min_new not in y_mains_sep_ohne_grenzen and - y_max_new in y_mains_sep_ohne_grenzen): - x_start=x_start[(y_sep>=y_min_new) & (y_sep=y_min_new) & (y_sep=y_min_new) & (y_sep=y_min_new) & (y_sep y_mid_new else: - x_start=x_start[(y_sep>=y_min_new) & 
(y_sep<=y_max_new)] - x_end=x_end[(y_sep>=y_min_new) & (y_sep<=y_max_new)] - kind=kind[(y_sep>=y_min_new) & (y_sep<=y_max_new)] - y_sep=y_sep[(y_sep>=y_min_new) & (y_sep<=y_max_new)] + where = y_mid >= y_mid_new + if y_mid_next_new in y_mains0: + where &= y_mid < y_mid_next_new + else: + where &= y_mid <= y_mid_next_new + x_start = x_start[where] + x_end = x_end[where] + y_mid = y_mid[where] #print(x_start,'x_start') #print(x_end,'x_end') - #print(len_sep) + # remove redundant separators that span the same columns + # (keeping only 1 representative each) deleted = set() - for i in range(len(x_start)-1): - nodes_i=set(range(x_start[i],x_end[i]+1)) - for j in range(i+1,len(x_start)): - if nodes_i==set(range(x_start[j],x_end[j]+1)): - deleted.add(j) - #print(np.unique(deleted)) - + for index_i in range(len(x_start) - 1): + nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) + #print(nodes_i, "nodes_i") + for index_j in range(index_i + 1, len(x_start)): + nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) + #print(nodes_j, "nodes_j") + if nodes_i == nodes_j: + deleted.add(index_j) + #print(deleted,"deleted") remained_sep_indexes = set(range(len(x_start))) - deleted #print(remained_sep_indexes,'remained_sep_indexes') - mother=[]#if it has mother - child=[] + + # determine which separators span which columns + mother = [] # whether the respective separator has a mother separator + child = [] # whether the respective separator has a child separator for index_i in remained_sep_indexes: have_mother=0 have_child=0 - nodes_ind=set(range(x_start[index_i],x_end[index_i]+1)) + nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) for index_j in remained_sep_indexes: - nodes_ind_j=set(range(x_start[index_j],x_end[index_j]+1)) - if nodes_indnodes_ind_j: + if nodes_i > nodes_j: have_child=1 mother.append(have_mother) child.append(have_child) - - #print(mother,'mother') - #print(len(remained_sep_indexes)) - 
#print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_sep),'lens') - y_lines_without_mother=[] - x_start_without_mother=[] - x_end_without_mother=[] - - y_lines_with_child_without_mother=[] - x_start_with_child_without_mother=[] - x_end_with_child_without_mother=[] + #print(mother, "mother") + #print(child, "child") mother = np.array(mother) child = np.array(child) #print(mother,'mother') #print(child,'child') remained_sep_indexes = np.array(list(remained_sep_indexes)) - x_start = np.array(x_start) - x_end = np.array(x_end) - y_sep = np.array(y_sep) + #print(len(remained_sep_indexes)) + #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_mid),'lens') - if len(remained_sep_indexes)>1: + reading_order_type = 0 + if len(remained_sep_indexes): #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)') #print(np.array(mother),'mother') remained_sep_indexes_without_mother = remained_sep_indexes[mother==0] @@ -262,52 +265,53 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_end_with_child_without_mother = x_end[remained_sep_indexes_with_child_without_mother] x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] - y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother] + y_mid_with_child_without_mother = y_mid[remained_sep_indexes_with_child_without_mother] - reading_order_type=0 x_end_without_mother = x_end[remained_sep_indexes_without_mother] x_start_without_mother = x_start[remained_sep_indexes_without_mother] - y_lines_without_mother = y_sep[remained_sep_indexes_without_mother] + y_mid_without_mother = y_mid[remained_sep_indexes_without_mother] if len(remained_sep_indexes_without_mother)>=2: for i in range(len(remained_sep_indexes_without_mother)-1): - nodes_i=set(range(x_start[remained_sep_indexes_without_mother[i]], - x_end[remained_sep_indexes_without_mother[i]] - # + 1 - )) - for j in range(i+1,len(remained_sep_indexes_without_mother)): - 
nodes_j=set(range(x_start[remained_sep_indexes_without_mother[j]], - x_end[remained_sep_indexes_without_mother[j]] - # + 1 - )) + index_i = remained_sep_indexes_without_mother[i] + nodes_i = set(range(x_start[index_i], x_end[index_i])) # + 1 + #print(index_i, nodes_i, "nodes_i without mother") + for j in range(i + 1, len(remained_sep_indexes_without_mother)): + index_j = remained_sep_indexes_without_mother[j] + nodes_j = set(range(x_start[index_j], x_end[index_j])) # + 1 + #print(index_j, nodes_j, "nodes_j without mother") if nodes_i - nodes_j != nodes_i: + #print("type=1") reading_order_type = 1 else: - reading_order_type = 0 - #print(reading_order_type,'javab') - #print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother') + y_mid_without_mother = np.zeros(0, int) + x_start_without_mother = np.zeros(0, int) + x_end_without_mother = np.zeros(0, int) + y_mid_with_child_without_mother = np.zeros(0, int) + x_start_with_child_without_mother = np.zeros(0, int) + x_end_with_child_without_mother = np.zeros(0, int) + + #print(reading_order_type,'reading_order_type') + #print(y_mid_with_child_without_mother,'y_mid_with_child_without_mother') #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') len_sep_with_child = len(child[child==1]) - #print(len_sep_with_child,'len_sep_with_child') there_is_sep_with_child = 0 if len_sep_with_child >= 1: there_is_sep_with_child = 1 - #print(all_args_uniq,'all_args_uniq') - #print(args_to_be_unified,'args_to_be_unified') return (reading_order_type, x_start_returned, x_end_returned, - y_sep_returned, - y_diff_returned, - y_lines_without_mother, + y_mid_returned, + y_max_returned, + y_mid_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, - y_lines_with_child_without_mother, + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, new_main_sep_y) 
From b2a79cc6ed766cef5074629fcb76ae1c6846f084 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:31:52 +0200 Subject: [PATCH 012/118] `return_x_start_end_mothers_childs_and_type_of_reading_order`: fix+1 when calculating `reading_order_type`, upper limit on column range (`x_end`) needs to be `+1` here as well --- src/eynollah/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f1a8aae..3a383e9 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -274,11 +274,11 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( if len(remained_sep_indexes_without_mother)>=2: for i in range(len(remained_sep_indexes_without_mother)-1): index_i = remained_sep_indexes_without_mother[i] - nodes_i = set(range(x_start[index_i], x_end[index_i])) # + 1 + nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) #print(index_i, nodes_i, "nodes_i without mother") for j in range(i + 1, len(remained_sep_indexes_without_mother)): index_j = remained_sep_indexes_without_mother[j] - nodes_j = set(range(x_start[index_j], x_end[index_j])) # + 1 + nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) #print(index_j, nodes_j, "nodes_j without mother") if nodes_i - nodes_j != nodes_i: #print("type=1") From acee4c1bfe227055194050935f1868d1fb156701 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:43:41 +0200 Subject: [PATCH 013/118] `find_number_of_columns_in_document`: simplify --- src/eynollah/utils/__init__.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 3a383e9..f948de2 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1551,23 +1551,23 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, 
(x_max_head>=.84*region_pre_p.shape[1])] cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) - cy_seps_splitters = np.sort(cy_seps_splitters) + cy_seps_splitters = np.sort(cy_seps_splitters).astype(int) splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] - splitter_y_new_diff = np.diff(splitter_y_new) / float(region_pre_p.shape[0]) * 100 - - args_big_parts=np.arange(len(splitter_y_new_diff))[ splitter_y_new_diff>22 ] + big_part = 22 * region_pre_p.shape[0] // 100 # percent height regions_without_separators=return_regions_without_separators(region_pre_p) - length_y_threshold=regions_without_separators.shape[0]/4.0 num_col_fin=0 peaks_neg_fin_fin=[] - for itiles in args_big_parts: - regions_without_separators_tile=regions_without_separators[int(splitter_y_new[itiles]): - int(splitter_y_new[itiles+1]),:] + num_big_parts = 0 + for top, bot in pairwise(splitter_y_new): + if bot - top < big_part: + continue + num_big_parts += 1 try: - num_col, peaks_neg_fin = find_num_col(regions_without_separators_tile, + num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot], num_col_classifier, tables, multiplier=7.0) + #print("big part %d:%d has %d columns" % (top, bot, num_col), peaks_neg_fin) except: num_col = 0 peaks_neg_fin = [] @@ -1575,7 +1575,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, num_col_fin=num_col peaks_neg_fin_fin=peaks_neg_fin - if len(args_big_parts)==1 and (len(peaks_neg_fin_fin)+1)=500] peaks_neg_fin=peaks_neg_fin[peaks_neg_fin<=(vertical.shape[1]-500)] From 5d15941b350841a4490e002c92ff89a5f6113905 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:51:59 +0200 Subject: [PATCH 014/118] `contours_in_same_horizon`: simplify - array instead of list operations - return array of index pairs instead of list objects --- src/eynollah/utils/__init__.py | 73 ++++++++++++++++------------------ src/eynollah/utils/contour.py | 25 +++++------- 2 files 
changed, 44 insertions(+), 54 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f948de2..10987ad 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1315,47 +1315,42 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( float(num_col_classifier)) if len_lines_bigger_than_x_width_smaller_than_acolumn_width_per_column < 10: args_hor=np.arange(len(slope_lines_hor)) - all_args_uniq=contours_in_same_horizon(cy_main_hor) - #print(all_args_uniq,'all_args_uniq') - if len(all_args_uniq)>0: - if type(all_args_uniq[0]) is list: - special_separators=[] - contours_new=[] - for dd in range(len(all_args_uniq)): - merged_all=None - some_args=args_hor[all_args_uniq[dd]] - some_cy=cy_main_hor[all_args_uniq[dd]] - some_x_min=x_min_main_hor[all_args_uniq[dd]] - some_x_max=x_max_main_hor[all_args_uniq[dd]] + sep_pairs=contours_in_same_horizon(cy_main_hor) + if len(sep_pairs): + special_separators=[] + contours_new=[] + for pair in sep_pairs: + merged_all=None + some_args=args_hor[pair] + some_cy=cy_main_hor[pair] + some_x_min=x_min_main_hor[pair] + some_x_max=x_max_main_hor[pair] - #img_in=np.zeros(separators_closeup_n[:,:,2].shape) - #print(img_p_in_ver.shape[1],some_x_max-some_x_min,'xdiff') - diff_x_some=some_x_max-some_x_min - for jv in range(len(some_args)): - img_p_in=cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1,1,1)) - if any(i_diff>(img_p_in_ver.shape[1]/float(3.3)) for i_diff in diff_x_some): - img_p_in[int(np.mean(some_cy))-5: - int(np.mean(some_cy))+5, - int(np.min(some_x_min)): - int(np.max(some_x_max)) ]=1 - sum_dis=dist_x_hor[some_args].sum() - diff_max_min_uniques=np.max(x_max_main_hor[some_args])-np.min(x_min_main_hor[some_args]) + #img_in=np.zeros(separators_closeup_n[:,:,2].shape) + #print(img_p_in_ver.shape[1],some_x_max-some_x_min,'xdiff') + diff_x_some=some_x_max-some_x_min + for jv in range(len(some_args)): + 
img_p_in=cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1,1,1)) + if any(i_diff>(img_p_in_ver.shape[1]/float(3.3)) for i_diff in diff_x_some): + img_p_in[int(np.mean(some_cy))-5: + int(np.mean(some_cy))+5, + int(np.min(some_x_min)): + int(np.max(some_x_max)) ]=1 + sum_dis=dist_x_hor[some_args].sum() + diff_max_min_uniques=np.max(x_max_main_hor[some_args])-np.min(x_min_main_hor[some_args]) - if (diff_max_min_uniques > sum_dis and - sum_dis / float(diff_max_min_uniques) > 0.85 and - diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and - np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): - # print(dist_x_hor[some_args], - # dist_x_hor[some_args].sum(), - # np.min(x_min_main_hor[some_args]), - # np.max(x_max_main_hor[some_args]),'jalibdi') - # print(np.mean( dist_x_hor[some_args] ), - # np.std( dist_x_hor[some_args] ), - # np.var( dist_x_hor[some_args] ),'jalibdiha') - special_separators.append(np.mean(cy_main_hor[some_args])) - else: - img_p_in=img_in_hor - special_separators=[] + if (diff_max_min_uniques > sum_dis and + sum_dis / float(diff_max_min_uniques) > 0.85 and + diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and + np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): + # print(dist_x_hor[some_args], + # dist_x_hor[some_args].sum(), + # np.min(x_min_main_hor[some_args]), + # np.max(x_max_main_hor[some_args]),'jalibdi') + # print(np.mean( dist_x_hor[some_args] ), + # np.std( dist_x_hor[some_args] ), + # np.var( dist_x_hor[some_args] ),'jalibdiha') + special_separators.append(np.mean(cy_main_hor[some_args])) else: img_p_in=img_in_hor special_separators=[] diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index f304db2..052688c 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -14,21 +14,16 @@ from shapely.ops import unary_union, nearest_points from .rotate import rotate_image, rotation_image_new def 
contours_in_same_horizon(cy_main_hor): - X1 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - X2 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - - X1[0::1, :] = cy_main_hor[:] - X2 = X1.T - - X_dif = np.abs(X2 - X1) - args_help = np.array(range(len(cy_main_hor))) - all_args = [] - for i in range(len(cy_main_hor)): - list_h = list(args_help[X_dif[i, :] <= 20]) - list_h.append(i) - if len(list_h) > 1: - all_args.append(list(set(list_h))) - return np.unique(np.array(all_args, dtype=object)) + """ + Takes an array of y coords, identifies all pairs among them + which are close to each other, and returns all such pairs + by index into the array. + """ + sort = np.argsort(cy_main_hor) + same = np.diff(cy_main_hor[sort] <= 20) + # groups = np.split(sort, np.arange(len(cy_main_hor) - 1)[~same] + 1) + same = np.flatnonzero(same) + return np.stack((sort[:-1][same], sort[1:][same])).T def find_contours_mean_y_diff(contours_main): M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] From 6cc5900943d5395adbbbea737871413bf10b9ccf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:55:07 +0200 Subject: [PATCH 015/118] `find_num_col`: add better plotting (but commented out) --- src/eynollah/utils/__init__.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 10987ad..4046396 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -485,9 +485,12 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(peaks_neg,'peaks_neg') # fig, (ax1, ax2) = plt.subplots(2, sharex=True) # ax1.imshow(regions_without_separators, aspect="auto") - # ax2.plot(z) - # ax2.scatter(peaks_neg, z[peaks_neg]) - # ax2.axhline(grenze, label="grenze") + # ax2.plot(z, color='red', label='z') + # ax2.plot(zneg[20:], color='blue', label='zneg') + # ax2.scatter(peaks_neg, z[peaks_neg], color='red') + # 
ax2.scatter(peaks_neg, zneg[20:][peaks_neg], color='blue') + # ax2.axhline(min_peaks_pos, color='red', label="min_peaks_pos") + # ax2.axhline(grenze, color='blue', label="grenze") # ax2.text(0, grenze, "grenze") # plt.show() @@ -816,6 +819,12 @@ def find_num_col_by_vertical_lines(regions_without_separators, multiplier=3.8): peaks, _ = find_peaks(z, height=0) # print(peaks,'peaksnew') + # fig, (ax1, ax2) = plt.subplots(2, sharex=True, suptitle='find_num_col_by_vertical_lines') + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(z) + # ax2.scatter(peaks, z[peaks]) + # ax2.set_title('find_peaks(regions_without_separators.sum(axis=0), height=0)') + # plt.show() return peaks def return_regions_without_separators(regions_pre): From 6fbb5f8a12185192f7d9db7b008c3ef8b5f24d33 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:02:39 +0200 Subject: [PATCH 016/118] `return_boxes_of_images_by_order_of_reading_new`: simplify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - array instead of list operations - add better plotting (but commented out) - add more debug printing (but commented out) - add more inline comments for documentation - rename identifiers to make more readable: - `cy_hor_diff` → `y_max_hor_some` (because the ymax gets passed) - `lines` → `seps` - `y_type_2` → `y_mid` - `y_diff_type_2` → `y_max` - `y_lines_by_order` → `y_mid_by_order` - `y_lines_without_mother` → `y_mid_without_mother` - `y_lines_with_child_without_mother` → `y_mid_with_child_without_mother` - `y_column` → `y_mid_column` - `y_column_nc` → `y_mid_column_nc` - `y_all_between_nm_wc` → `y_mid_between_nm_wc` - `lines_so_close_to_top_separator` → `seps_too_close_to_top_separator` - `y_in_cols` and `y_down` → `y_mid_next` - use `pairwise()` `nc_top:nc_bot` instead of `i_c` indexing --- src/eynollah/utils/__init__.py | 480 +++++++++++++++++---------------- 1 file changed, 247 insertions(+), 233 deletions(-) diff --git 
a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 4046396..eca96f3 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1599,19 +1599,31 @@ def return_boxes_of_images_by_order_of_reading_new( if logger is None: logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') - # def dbg_plt(box=None, title=None): - # if box is None: - # box = [None, None, None, None] - # img = regions_without_separators[box[2]:box[3], box[0]:box[1]] + + # def dbg_plt(box=None, title=None, rectangles=None, rectangles_showidx=False): + # minx, maxx, miny, maxy = box or (0, None, 0, None) + # img = regions_without_separators[miny:maxy, minx:maxx] # plt.imshow(img) # xrange = np.arange(0, img.shape[1], 100) # yrange = np.arange(0, img.shape[0], 100) - # plt.gca().set_xticks(xrange, xrange + (box[0] or 0)) - # plt.gca().set_yticks(yrange, yrange + (box[2] or 0)) + # ax = plt.gca() + # ax.set_xticks(xrange) + # ax.set_yticks(yrange) + # ax.set_xticklabels(xrange + minx) + # ax.set_yticklabels(yrange + miny) + # def format_coord(x, y): + # return 'x={:g}, y={:g}'.format(x + minx, y + miny) + # ax.format_coord = format_coord # if title: # plt.title(title) + # if rectangles: + # for i, (xmin, xmax, ymin, ymax) in enumerate(rectangles): + # ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # if rectangles_showidx: + # ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i + 1), c='r') # plt.show() - # dbg_plt() + # dbg_plt(title="return_boxes_of_images_by_order_of_reading_new") boxes=[] peaks_neg_tot_tables = [] @@ -1619,9 +1631,7 @@ def return_boxes_of_images_by_order_of_reading_new( width_tot = regions_without_separators.shape[1] for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') - # dbg_plt([None, None, top, bot], - # "image cut for y split %d:%d" % ( - # top, bot)) + # dbg_plt([0, None, top, bot], "image cut for y 
split %d:%d" % (top, bot)) matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) & (matrix_of_lines_ch[:,7] < bot)] #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) @@ -1677,20 +1687,21 @@ def return_boxes_of_images_by_order_of_reading_new( peaks_neg_fin = peaks_neg_fin1 else: peaks_neg_fin = peaks_neg_fin2 + # add offset to local result peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - if right < peaks_neg_fin_early[-1]: - peaks_neg_fin_rev.append(right) peaks_neg_fin_rev.extend(peaks_neg_fin) + if right < peaks_neg_fin_early[-1]: + # all but the last column: interject the preexisting boundary + peaks_neg_fin_rev.append(right) + #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): - peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) - num_col=len(peaks_neg_fin) + if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + peaks_neg_fin = peaks_neg_fin_rev else: - peaks_neg_fin=list(np.copy(peaks_neg_fin_org)) - num_col=len(peaks_neg_fin) - + peaks_neg_fin = peaks_neg_fin_org + num_col = len(peaks_neg_fin) #print(peaks_neg_fin,'peaks_neg_fin') except: logger.exception("cannot find peaks consistent with columns") @@ -1700,7 +1711,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] + y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] if right2left_readingorder: x_max_hor_some_new = width_tot - x_min_hor_some @@ -1708,136 +1719,121 @@ def return_boxes_of_images_by_order_of_reading_new( x_min_hor_some =list(np.copy(x_min_hor_some_new)) x_max_hor_some =list(np.copy(x_max_hor_some_new)) - peaks_neg_tot = [0] + peaks_neg_fin + [width_tot] + peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) + #print(peaks_neg_tot,'peaks_neg_tot') 
peaks_neg_tot_tables.append(peaks_neg_tot) - reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \ - y_lines_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, \ - y_lines_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff) - all_columns = set(range(len(peaks_neg_tot) - 1)) - # print("all_columns", all_columns) + #print("all_columns", all_columns) + + reading_order_type, x_starting, x_ending, y_mid, y_max, \ + y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ + there_is_sep_with_child, \ + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ + new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( + x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) + + # show multi-column separators + # dbg_plt([0, None, top, bot], "multi-column separators in current split", + # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], + # y_mid - top, y_max - top)), True) + if (reading_order_type == 1 or - len(y_lines_without_mother) >= 2 or + len(y_mid_without_mother) >= 2 or there_is_sep_with_child == 1): + # there are top-level multi-colspan horizontal separators which overlap each other + # or multiple top-level multi-colspan horizontal separators + # or multi-colspan horizontal separators shorter than their respective top-level: + # todo: explain how this is dealt with try: y_grenze = top + 300 - #check if there is a big separator in this y_mains_sep_ohne_grenzen + up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys=np.arange(len(y_type_2)) + args_early_ys=np.arange(len(y_mid)) #print(args_early_ys,'args_early_ys') - #print(top, bot) + #print(y_mid,'y_mid') - x_starting_up = x_starting[(y_type_2 > top) & 
- (y_type_2 <= y_grenze)] - x_ending_up = x_ending[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_type_2_up = y_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_diff_type_2_up = y_diff_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - args_up = args_early_ys[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - if len(y_type_2_up) > 0: - y_main_separator_up = y_type_2_up [(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - y_diff_main_separator_up = y_diff_type_2_up[(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - args_main_to_deleted = args_up[(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - #print(y_main_separator_up,y_diff_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_diff_main_separator_up) > 0: + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up = args_early_ys[up] + #print(args_up,'args_up') + #print(y_mid_up,'y_mid_up') + #check if there is a big separator in this y_mains0 + if len(y_mid_up) > 0: + # is there a separator with full-width span? 
+ main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) + y_mid_main_separator_up = y_mid_up[main_separator] + y_max_main_separator_up = y_max_up[main_separator] + args_main_to_deleted = args_up[main_separator] + #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') + if len(y_max_main_separator_up): args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, y_diff_main_separator_up.max()]) - # dbg_plt(boxes[-1], "first box") - top = y_diff_main_separator_up.max() + boxes.append([0, peaks_neg_tot[-1], + top, y_max_main_separator_up.max()]) + # dbg_plt(boxes[-1], "near top main separator box") + top = y_max_main_separator_up.max() #print(top,'top') - y_type_2 = y_type_2[args_to_be_kept] + y_mid = y_mid[args_to_be_kept] x_starting = x_starting[args_to_be_kept] x_ending = x_ending[args_to_be_kept] - y_diff_type_2 = y_diff_type_2[args_to_be_kept] + y_max = y_max[args_to_be_kept] #print('galdiha') y_grenze = top + 200 - args_early_ys2=np.arange(len(y_type_2)) - y_type_2_up=y_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - x_starting_up=x_starting[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - x_ending_up=x_ending[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_diff_type_2_up=y_diff_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - args_up2=args_early_ys2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - #print(y_type_2_up,x_starting_up,x_ending_up,'didid') - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') + up = (y_mid > top) & (y_mid <= y_grenze) + args_early_ys2 = np.arange(len(y_mid)) + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up2 = args_early_ys2[up] + 
#print(y_mid_up,x_starting_up,x_ending_up,'didid') + else: + args_early_ys2 = args_early_ys + args_up2 = args_up - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2=np.array(list( set(args_early_ys2)-set(args_up2) )) + nodes_in = set() + for ij in range(len(x_starting_up)): + nodes_in.update(range(x_starting_up[ij], + x_ending_up[ij])) + #print(nodes_in,'nodes_in') + #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') - if len(args_to_be_kept2)>0: - y_type_2 = y_type_2[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_diff_type_2 = y_diff_type_2[args_to_be_kept2] - else: - pass - #print('burdaydikh2') - elif len(y_diff_main_separator_up)==0: - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in2') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + if nodes_in == set(range(len(peaks_neg_tot)-1)): + pass + elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): + pass + else: + #print('burdaydikh') + args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1,len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - #print(args_early_ys,'args_early_ys') - #print(args_up,'args_up') - args_to_be_kept2=np.array(list( set(args_early_ys) - set(args_up) )) - - #print(args_to_be_kept2,'args_to_be_kept2') - #print(len(y_type_2),len(x_starting),len(x_ending),len(y_diff_type_2)) - if len(args_to_be_kept2)>0: - y_type_2 = y_type_2[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_diff_type_2 = y_diff_type_2[args_to_be_kept2] - else: - pass - #print('burdaydikh2') + if 
len(args_to_be_kept2): + #print(args_to_be_kept2, "args_to_be_kept2") + y_mid = y_mid[args_to_be_kept2] + x_starting = x_starting[args_to_be_kept2] + x_ending = x_ending[args_to_be_kept2] + y_max = y_max[args_to_be_kept2] #int(top) - y_lines_by_order=[] + # order multi-column separators + y_mid_by_order=[] x_start_by_order=[] x_end_by_order=[] if (reading_order_type == 1 or len(x_end_with_child_without_mother) == 0): if reading_order_type == 1: - y_lines_by_order.append(top) + # there are top-level multi-colspan horizontal separators which overlap each other + #print("adding all columns at top because of multiple overlapping mothers") + y_mid_by_order.append(top) x_start_by_order.append(0) x_end_by_order.append(len(peaks_neg_tot)-2) else: + # there are no top-level multi-colspan horizontal separators which themselves + # contain shorter multi-colspan separators #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() for dj in range(len(x_start_without_mother)): @@ -1845,31 +1841,32 @@ def return_boxes_of_images_by_order_of_reading_new( range(x_start_without_mother[dj], x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) x_ending = np.append(x_ending, 
np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - ind_args=np.arange(len(y_type_2)) - #ind_args=np.array(ind_args) + ind_args=np.arange(len(y_mid)) #print(ind_args,'ind_args') for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: @@ -1880,93 +1877,113 @@ def return_boxes_of_images_by_order_of_reading_new( range(x_start_without_mother[dj], x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, 
x_end_without_mother) - columns_covered_by_with_child_no_mothers = set() + columns_covered_by_mothers_with_child = set() for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_with_child_no_mothers.update( + columns_covered_by_mothers_with_child.update( range(x_start_with_child_without_mother[dj], x_end_with_child_without_mother[dj])) - columns_not_covered_child_no_mother = list( - all_columns - columns_covered_by_with_child_no_mothers) + #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") + columns_not_covered_by_mothers_with_child = list( + all_columns - columns_covered_by_mothers_with_child) #indexes_to_be_spanned=[] for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_child_no_mother.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_child_no_mother = np.sort(columns_not_covered_child_no_mother) - ind_args = np.arange(len(y_type_2)) - x_end_with_child_without_mother = np.array(x_end_with_child_without_mother, int) - x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) - for i_s_nc in columns_not_covered_child_no_mother: + columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) + columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) + #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") + ind_args = np.arange(len(y_mid)) + for i_s_nc in columns_not_covered_by_mothers_with_child: if i_s_nc in x_start_with_child_without_mother: + # use only seps with mother's span ("biggest") #print("i_s_nc", i_s_nc) x_end_biggest_column = \ - x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] - args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & - (x_ending==x_end_biggest_column)] - y_column_nc = y_type_2[args_all_biggest_lines] - #x_start_column_nc = x_starting[args_all_biggest_lines] - #x_end_column_nc = 
x_ending[args_all_biggest_lines] - y_column_nc = np.sort(y_column_nc) - for i_c in range(len(y_column_nc)): + x_end_with_child_without_mother[ + x_start_with_child_without_mother == i_s_nc][0] + args_all_biggest_seps = \ + ind_args[(x_starting == i_s_nc) & + (x_ending == x_end_biggest_column)] + y_mid_column_nc = y_mid[args_all_biggest_seps] + #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") + #x_start_column_nc = x_starting[args_all_biggest_seps] + #x_end_column_nc = x_ending[args_all_biggest_seps] + y_mid_column_nc = np.sort(y_mid_column_nc) + #print(y_mid_column_nc, "y_mid_column_nc (sorted)") + for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): #print("i_c", i_c) - ind_all_lines_between_nm_wc = \ - ind_args[(y_type_2 > y_column_nc[i_c]) & - (y_type_2 < (y_column_nc[i_c+1] - if i_c < len(y_column_nc)-1 - else bot)) & + #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") + ind_all_seps_between_nm_wc = \ + ind_args[(y_mid > nc_top) & + (y_mid < nc_bot) & (x_starting >= i_s_nc) & (x_ending <= x_end_biggest_column)] - y_all_between_nm_wc = y_type_2[ind_all_lines_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] + y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] + x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] + x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] columns_covered_by_mothers = set() - for dj in range(len(ind_all_lines_between_nm_wc)): + for dj in range(len(ind_all_seps_between_nm_wc)): columns_covered_by_mothers.update( range(x_starting_all_between_nm_wc[dj], x_ending_all_between_nm_wc[dj])) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") child_columns = set(range(i_s_nc, x_end_biggest_column)) columns_not_covered = list(child_columns - columns_covered_by_mothers) + #print(child_columns, "child_columns") + 
#print(columns_not_covered, "columns_not_covered") - if len(ind_all_lines_between_nm_wc): + if len(ind_all_seps_between_nm_wc): biggest = np.argmax(x_ending_all_between_nm_wc - x_starting_all_between_nm_wc) + #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") + #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest]), "biggest") if columns_covered_by_mothers == set( range(x_starting_all_between_nm_wc[biggest], x_ending_all_between_nm_wc[biggest])): - # biggest accounts for all columns alone, - # longest line should be extended - lines_so_close_to_top_separator = \ - ((y_all_between_nm_wc > y_column_nc[i_c]) & - (y_all_between_nm_wc <= y_column_nc[i_c] + 500)) - if (np.count_nonzero(lines_so_close_to_top_separator) and - np.count_nonzero(lines_so_close_to_top_separator) < - len(ind_all_lines_between_nm_wc)): - y_all_between_nm_wc = \ - y_all_between_nm_wc[~lines_so_close_to_top_separator] + # single biggest accounts for all covered columns alone, + # this separator should be extended to cover all + seps_too_close_to_top_separator = \ + ((y_mid_all_between_nm_wc > nc_top) & + (y_mid_all_between_nm_wc <= nc_top + 500)) + if (np.count_nonzero(seps_too_close_to_top_separator) and + np.count_nonzero(seps_too_close_to_top_separator) < + len(ind_all_seps_between_nm_wc)): + #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") + y_mid_all_between_nm_wc = \ + y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~lines_so_close_to_top_separator] + x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~lines_so_close_to_top_separator] + x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) - 
x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_end_biggest_column) else: - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) if len(columns_not_covered): - y_all_between_nm_wc = np.append( - y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) x_starting_all_between_nm_wc = np.append( x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) x_ending_all_between_nm_wc = np.append( @@ -1977,52 +1994,53 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_all_between_nm_wc[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted=np.argsort(y_mid_column) + 
y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: - #print(column,'column') + #print(i_s_nc,'column not covered by mothers with child') ind_args_in_col=ind_args[x_starting==i_s_nc] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted = np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - y_lines_by_order = np.array(y_lines_by_order) + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) x_start_by_order = np.array(x_start_by_order) x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_lines_by_order)): + for il in range(len(y_mid_by_order)): #print(il, "il") - y_itself = y_lines_by_order[il] + y_mid_itself = y_mid_by_order[il] x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') - y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] #print('burda') - y_down = y_in_cols.min(initial=bot) #print('burda2') - #print(y_in_cols,'y_in_cols') - #print(y_itself,'y_itself') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + y_mid_next = y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') + #print(y_mid_itself,'y_mid_itself') 
boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], - y_itself, - y_down]) + y_mid_itself, + y_mid_next]) # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) except: logger.exception("cannot assign boxes") @@ -2030,20 +2048,21 @@ def return_boxes_of_images_by_order_of_reading_new( top, bot]) # dbg_plt(boxes[-1], "fallback box") else: - y_lines_by_order=[] + # order multi-column separators + y_mid_by_order=[] x_start_by_order=[] x_end_by_order=[] if len(x_starting)>0: - columns_covered_by_lines_covered_more_than_2col = set() + columns_covered_by_seps_covered_more_than_2col = set() for dj in range(len(x_starting)): if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_lines_covered_more_than_2col.update( + columns_covered_by_seps_covered_more_than_2col.update( range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col) + columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2055,53 +2074,52 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, x_ending[0]) else: columns_not_covered = list(all_columns) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + y_mid = np.append(y_mid, 
np.ones(len(columns_not_covered), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - ind_args = np.arange(len(y_type_2)) - + ind_args = np.arange(len(y_mid)) + for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] - ind_args_col_sorted = np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - y_lines_by_order = np.array(y_lines_by_order) + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) x_start_by_order = np.array(x_start_by_order) x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_lines_by_order)): + for il in range(len(y_mid_by_order)): #print(il, "il") - y_itself = y_lines_by_order[il] - #print(y_itself,'y_itself') + y_mid_itself = y_mid_by_order[il] + #print(y_mid_itself,'y_mid_itself') x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] for column in range(x_start_itself, x_end_itself+1): #print(column,'cols') - y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] #print('burda2') - #print(y_in_cols,'y_in_cols') - y_down = y_in_cols.min(initial=bot) - #print(y_down,'y_down') + y_mid_next = 
y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print(y_mid_next,'y_mid_next') + y_mid_next = y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], - y_itself, - y_down]) + y_mid_itself, + y_mid_next]) # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) - #else: - #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,top, bot]) if right2left_readingorder: peaks_neg_tot_tables_new = [] @@ -2119,11 +2137,7 @@ def return_boxes_of_images_by_order_of_reading_new( peaks_neg_tot_tables = peaks_neg_tot_tables_new # show final xy-cut - # plt.imshow(regions_without_separators) - # for xmin, xmax, ymin, ymax in boxes: - # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, - # fill=False, linewidth=1, edgecolor='r')) - # plt.show() + # dbg_plt(None, "final XY-Cut", boxes, True) logger.debug('exit return_boxes_of_images_by_order_of_reading_new') return boxes, peaks_neg_tot_tables From 66a0e55e49e4224e38c9792d06d2468c7fe8fe90 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:15:13 +0200 Subject: [PATCH 017/118] `return_boxes_of_images_by_order_of_reading_new`: avoid oversplits when y slice (`top:bot`) is not a significant part of the page, viz. 
less than 22% (as in `find_number_of_columns_in_document`), avoid forcing `find_num_col` to reach `num_col_classifier` (allows large headers not to be split up and thus better ordered) --- src/eynollah/utils/__init__.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index eca96f3..2017cea 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1628,7 +1628,8 @@ def return_boxes_of_images_by_order_of_reading_new( boxes=[] peaks_neg_tot_tables = [] splitter_y_new = np.array(splitter_y_new, dtype=int) - width_tot = regions_without_separators.shape[1] + height_tot, width_tot = regions_without_separators.shape + big_part = 22 * height_tot // 100 # percent height for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) @@ -1644,12 +1645,17 @@ def return_boxes_of_images_by_order_of_reading_new( try: num_col, peaks_neg_fin = find_num_col( regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.) + # we do not expect to get all columns in small parts (headings etc.): + num_col_classifier if bot - top >= big_part else 1, + tables, multiplier=6. if erosion_hurts else 7.) 
except: peaks_neg_fin=[] num_col = 0 try: - if (len(peaks_neg_fin)+1)= big_part): # found too few columns here #print('burda') peaks_neg_fin_org = np.copy(peaks_neg_fin) From 3ebbc2d693ae14a640c3cb478b6a01cd1e42efb7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:30:39 +0200 Subject: [PATCH 018/118] `return_boxes_of_images_by_order_of_reading_new`: indent (by removing unnecessary conditional) --- src/eynollah/utils/__init__.py | 843 ++++++++++++++++----------------- 1 file changed, 421 insertions(+), 422 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 2017cea..f30d55e 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1641,241 +1641,204 @@ def return_boxes_of_images_by_order_of_reading_new( #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= # 0.1 * (np.abs(bot-top))): - if True: - try: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - # we do not expect to get all columns in small parts (headings etc.): - num_col_classifier if bot - top >= big_part else 1, - tables, multiplier=6. if erosion_hurts else 7.) - except: - peaks_neg_fin=[] - num_col = 0 - try: - if ((len(peaks_neg_fin) + 1 < num_col_classifier or - num_col_classifier == 6) and - # we do not expect to get all columns in small parts (headings etc.): - bot - top >= big_part): - # found too few columns here - #print('burda') - peaks_neg_fin_org = np.copy(peaks_neg_fin) - #print("peaks_neg_fin_org", peaks_neg_fin_org) - if len(peaks_neg_fin)==0: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=3.) 
- #print(peaks_neg_fin,'peaks_neg_fin') - peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] + try: + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + # we do not expect to get all columns in small parts (headings etc.): + num_col_classifier if bot - top >= big_part else 1, + tables, multiplier=6. if erosion_hurts else 7.) + except: + peaks_neg_fin=[] + num_col = 0 + try: + if ((len(peaks_neg_fin) + 1 < num_col_classifier or + num_col_classifier == 6) and + # we do not expect to get all columns in small parts (headings etc.): + bot - top >= big_part): + # found too few columns here + #print('burda') + peaks_neg_fin_org = np.copy(peaks_neg_fin) + #print("peaks_neg_fin_org", peaks_neg_fin_org) + if len(peaks_neg_fin)==0: + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + num_col_classifier, tables, multiplier=3.) + #print(peaks_neg_fin,'peaks_neg_fin') + peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] - #print(peaks_neg_fin_early,'burda2') - peaks_neg_fin_rev=[] - for left, right in pairwise(peaks_neg_fin_early): - # print("%d:%d" % (left, right), 'i_n') - # dbg_plt([left, right, top, bot], - # "image cut for y split %d:%d / x gap %d:%d" % ( - # top, bot, left, right)) - # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) - # plt.title("vertical projection (sum over y)") - # plt.show() - try: - _, peaks_neg_fin1 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=7.) - except: - peaks_neg_fin1 = [] - try: - _, peaks_neg_fin2 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=5.) 
- except: - peaks_neg_fin2 = [] - if len(peaks_neg_fin1) >= len(peaks_neg_fin2): - peaks_neg_fin = peaks_neg_fin1 - else: - peaks_neg_fin = peaks_neg_fin2 - # add offset to local result - peaks_neg_fin = list(np.array(peaks_neg_fin) + left) - #print(peaks_neg_fin,'peaks_neg_fin') - - peaks_neg_fin_rev.extend(peaks_neg_fin) - if right < peaks_neg_fin_early[-1]: - # all but the last column: interject the preexisting boundary - peaks_neg_fin_rev.append(right) - #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - - if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): - peaks_neg_fin = peaks_neg_fin_rev + #print(peaks_neg_fin_early,'burda2') + peaks_neg_fin_rev=[] + for left, right in pairwise(peaks_neg_fin_early): + # print("%d:%d" % (left, right), 'i_n') + # dbg_plt([left, right, top, bot], + # "image cut for y split %d:%d / x gap %d:%d" % ( + # top, bot, left, right)) + # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) + # plt.title("vertical projection (sum over y)") + # plt.show() + try: + _, peaks_neg_fin1 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=7.) + except: + peaks_neg_fin1 = [] + try: + _, peaks_neg_fin2 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=5.) 
+ except: + peaks_neg_fin2 = [] + if len(peaks_neg_fin1) >= len(peaks_neg_fin2): + peaks_neg_fin = peaks_neg_fin1 else: - peaks_neg_fin = peaks_neg_fin_org - num_col = len(peaks_neg_fin) + peaks_neg_fin = peaks_neg_fin2 + # add offset to local result + peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - except: - logger.exception("cannot find peaks consistent with columns") - #num_col, peaks_neg_fin = find_num_col( - # regions_without_separators[top:bot,:], - # multiplier=7.0) - x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] - x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] - cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - if right2left_readingorder: - x_max_hor_some_new = width_tot - x_min_hor_some - x_min_hor_some_new = width_tot - x_max_hor_some - x_min_hor_some =list(np.copy(x_min_hor_some_new)) - x_max_hor_some =list(np.copy(x_max_hor_some_new)) + peaks_neg_fin_rev.extend(peaks_neg_fin) + if right < peaks_neg_fin_early[-1]: + # all but the last column: interject the preexisting boundary + peaks_neg_fin_rev.append(right) + #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) - #print(peaks_neg_tot,'peaks_neg_tot') - peaks_neg_tot_tables.append(peaks_neg_tot) + if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + peaks_neg_fin = peaks_neg_fin_rev + else: + peaks_neg_fin = peaks_neg_fin_org + num_col = len(peaks_neg_fin) + #print(peaks_neg_fin,'peaks_neg_fin') + except: + logger.exception("cannot find peaks consistent with columns") + #num_col, peaks_neg_fin = find_num_col( + # regions_without_separators[top:bot,:], + # multiplier=7.0) + x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] + x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] + cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] + y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - all_columns = 
set(range(len(peaks_neg_tot) - 1)) - #print("all_columns", all_columns) + if right2left_readingorder: + x_max_hor_some_new = width_tot - x_min_hor_some + x_min_hor_some_new = width_tot - x_max_hor_some + x_min_hor_some =list(np.copy(x_min_hor_some_new)) + x_max_hor_some =list(np.copy(x_max_hor_some_new)) - reading_order_type, x_starting, x_ending, y_mid, y_max, \ - y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ - there_is_sep_with_child, \ - y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) + peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) + #print(peaks_neg_tot,'peaks_neg_tot') + peaks_neg_tot_tables.append(peaks_neg_tot) - # show multi-column separators - # dbg_plt([0, None, top, bot], "multi-column separators in current split", - # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], - # y_mid - top, y_max - top)), True) + all_columns = set(range(len(peaks_neg_tot) - 1)) + #print("all_columns", all_columns) - if (reading_order_type == 1 or - len(y_mid_without_mother) >= 2 or - there_is_sep_with_child == 1): - # there are top-level multi-colspan horizontal separators which overlap each other - # or multiple top-level multi-colspan horizontal separators - # or multi-colspan horizontal separators shorter than their respective top-level: - # todo: explain how this is dealt with - try: - y_grenze = top + 300 - up = (y_mid > top) & (y_mid <= y_grenze) + reading_order_type, x_starting, x_ending, y_mid, y_max, \ + y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ + there_is_sep_with_child, \ + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ + new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( + x_min_hor_some, 
x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) - args_early_ys=np.arange(len(y_mid)) - #print(args_early_ys,'args_early_ys') - #print(y_mid,'y_mid') + # show multi-column separators + # dbg_plt([0, None, top, bot], "multi-column separators in current split", + # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], + # y_mid - top, y_max - top)), True) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up = args_early_ys[up] - #print(args_up,'args_up') - #print(y_mid_up,'y_mid_up') - #check if there is a big separator in this y_mains0 - if len(y_mid_up) > 0: - # is there a separator with full-width span? - main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) - y_mid_main_separator_up = y_mid_up[main_separator] - y_max_main_separator_up = y_max_up[main_separator] - args_main_to_deleted = args_up[main_separator] - #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_max_main_separator_up): - args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) - #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[-1], - top, y_max_main_separator_up.max()]) - # dbg_plt(boxes[-1], "near top main separator box") - top = y_max_main_separator_up.max() + if (reading_order_type == 1 or + len(y_mid_without_mother) >= 2 or + there_is_sep_with_child == 1): + # there are top-level multi-colspan horizontal separators which overlap each other + # or multiple top-level multi-colspan horizontal separators + # or multi-colspan horizontal separators shorter than their respective top-level: + # todo: explain how this is dealt with + try: + y_grenze = top + 300 + up = (y_mid > top) & (y_mid <= y_grenze) - #print(top,'top') - y_mid = y_mid[args_to_be_kept] - x_starting = x_starting[args_to_be_kept] - x_ending = x_ending[args_to_be_kept] - y_max = y_max[args_to_be_kept] + 
args_early_ys=np.arange(len(y_mid)) + #print(args_early_ys,'args_early_ys') + #print(y_mid,'y_mid') - #print('galdiha') - y_grenze = top + 200 - up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys2 = np.arange(len(y_mid)) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up2 = args_early_ys2[up] - #print(y_mid_up,x_starting_up,x_ending_up,'didid') - else: - args_early_ys2 = args_early_ys - args_up2 = args_up + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up = args_early_ys[up] + #print(args_up,'args_up') + #print(y_mid_up,'y_mid_up') + #check if there is a big separator in this y_mains0 + if len(y_mid_up) > 0: + # is there a separator with full-width span? + main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) + y_mid_main_separator_up = y_mid_up[main_separator] + y_max_main_separator_up = y_max_up[main_separator] + args_main_to_deleted = args_up[main_separator] + #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') + if len(y_max_main_separator_up): + args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) + #print(args_to_be_kept,'args_to_be_kept') + boxes.append([0, peaks_neg_tot[-1], + top, y_max_main_separator_up.max()]) + # dbg_plt(boxes[-1], "near top main separator box") + top = y_max_main_separator_up.max() - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + #print(top,'top') + y_mid = y_mid[args_to_be_kept] + x_starting = x_starting[args_to_be_kept] + x_ending = x_ending[args_to_be_kept] + y_max = y_max[args_to_be_kept] - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - 
else: - #print('burdaydikh') - args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) - - if len(args_to_be_kept2): - #print(args_to_be_kept2, "args_to_be_kept2") - y_mid = y_mid[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_max = y_max[args_to_be_kept2] - - #int(top) - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if (reading_order_type == 1 or - len(x_end_with_child_without_mother) == 0): - if reading_order_type == 1: - # there are top-level multi-colspan horizontal separators which overlap each other - #print("adding all columns at top because of multiple overlapping mothers") - y_mid_by_order.append(top) - x_start_by_order.append(0) - x_end_by_order.append(len(peaks_neg_tot)-2) - else: - # there are no top-level multi-colspan horizontal separators which themselves - # contain shorter multi-colspan separators - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - ind_args=np.arange(len(y_mid)) - 
#print(ind_args,'ind_args') - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + #print('galdiha') + y_grenze = top + 200 + up = (y_mid > top) & (y_mid <= y_grenze) + args_early_ys2 = np.arange(len(y_mid)) + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up2 = args_early_ys2[up] + #print(y_mid_up,x_starting_up,x_ending_up,'didid') else: + args_early_ys2 = args_early_ys + args_up2 = args_up + + nodes_in = set() + for ij in range(len(x_starting_up)): + nodes_in.update(range(x_starting_up[ij], + x_ending_up[ij])) + #print(nodes_in,'nodes_in') + #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + + if nodes_in == set(range(len(peaks_neg_tot)-1)): + pass + elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): + pass + else: + #print('burdaydikh') + args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) + + if len(args_to_be_kept2): + #print(args_to_be_kept2, "args_to_be_kept2") + y_mid = y_mid[args_to_be_kept2] + x_starting = x_starting[args_to_be_kept2] + x_ending = x_ending[args_to_be_kept2] + y_max = y_max[args_to_be_kept2] + + #int(top) + # order multi-column separators + y_mid_by_order=[] + x_start_by_order=[] + x_end_by_order=[] + if (reading_order_type == 1 or + len(x_end_with_child_without_mother) == 0): + if reading_order_type == 1: + # there are top-level multi-colspan horizontal separators which overlap each other + 
#print("adding all columns at top because of multiple overlapping mothers") + y_mid_by_order.append(top) + x_start_by_order.append(0) + x_end_by_order.append(len(peaks_neg_tot)-2) + else: + # there are no top-level multi-colspan horizontal separators which themselves + # contain shorter multi-colspan separators #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() for dj in range(len(x_start_without_mother)): @@ -1895,212 +1858,170 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - columns_covered_by_mothers_with_child = set() - for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_mothers_with_child.update( - range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) - #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") - columns_not_covered_by_mothers_with_child = list( - all_columns - columns_covered_by_mothers_with_child) - #indexes_to_be_spanned=[] - for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) - #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") - ind_args = np.arange(len(y_mid)) - for i_s_nc in columns_not_covered_by_mothers_with_child: - if i_s_nc in x_start_with_child_without_mother: - # use only seps with mother's span ("biggest") - #print("i_s_nc", i_s_nc) - x_end_biggest_column = \ - x_end_with_child_without_mother[ - x_start_with_child_without_mother == i_s_nc][0] - args_all_biggest_seps = \ - ind_args[(x_starting == i_s_nc) & - (x_ending == x_end_biggest_column)] - y_mid_column_nc = y_mid[args_all_biggest_seps] - #print("%d:%d" % (i_s_nc, 
x_end_biggest_column), "columns covered by mother with child") - #x_start_column_nc = x_starting[args_all_biggest_seps] - #x_end_column_nc = x_ending[args_all_biggest_seps] - y_mid_column_nc = np.sort(y_mid_column_nc) - #print(y_mid_column_nc, "y_mid_column_nc (sorted)") - for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): - #print("i_c", i_c) - #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") - ind_all_seps_between_nm_wc = \ - ind_args[(y_mid > nc_top) & - (y_mid < nc_bot) & - (x_starting >= i_s_nc) & - (x_ending <= x_end_biggest_column)] - y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] - - columns_covered_by_mothers = set() - for dj in range(len(ind_all_seps_between_nm_wc)): - columns_covered_by_mothers.update( - range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - child_columns = set(range(i_s_nc, x_end_biggest_column)) - columns_not_covered = list(child_columns - columns_covered_by_mothers) - #print(child_columns, "child_columns") - #print(columns_not_covered, "columns_not_covered") - - if len(ind_all_seps_between_nm_wc): - biggest = np.argmax(x_ending_all_between_nm_wc - - x_starting_all_between_nm_wc) - #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") - #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest]), "biggest") - if columns_covered_by_mothers == set( - range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])): - # single biggest accounts for all covered columns alone, - # this separator should be extended to cover all - seps_too_close_to_top_separator = \ - ((y_mid_all_between_nm_wc > nc_top) & - (y_mid_all_between_nm_wc <= nc_top + 500)) - if (np.count_nonzero(seps_too_close_to_top_separator) and - 
np.count_nonzero(seps_too_close_to_top_separator) < - len(ind_all_seps_between_nm_wc)): - #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") - y_mid_all_between_nm_wc = \ - y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] - x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] - x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] - - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_end_biggest_column) - else: - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - - if len(columns_not_covered): - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) - - ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(int(i_s_nc), int(x_end_biggest_column)): - ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] - x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] - x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - 
x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - else: - #print(i_s_nc,'column not covered by mothers with child') - ind_args_in_col=ind_args[x_starting==i_s_nc] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(int(x_start_itself), int(x_end_itself)+1): - #print(column,'cols') - #print('burda') - #print('burda2') - y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') - #print(y_mid_itself,'y_mid_itself') - boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) - except: - logger.exception("cannot assign boxes") - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, bot]) - # dbg_plt(boxes[-1], "fallback box") - else: - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if len(x_starting)>0: - columns_covered_by_seps_covered_more_than_2col = set() - for dj in range(len(x_starting)): - if 
set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_seps_covered_more_than_2col.update( - range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - if len(new_main_sep_y) > 0: - x_starting = np.append(x_starting, 0) - x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) - else: - x_starting = np.append(x_starting, x_starting[0]) - x_ending = np.append(x_ending, x_ending[0]) + ind_args=np.arange(len(y_mid)) + #print(ind_args,'ind_args') + for column in range(len(peaks_neg_tot)-1): + #print(column,'column') + ind_args_in_col=ind_args[x_starting==column] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + #print('babali3') + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: - columns_not_covered = list(all_columns) - y_mid = np.append(y_mid, np.ones(len(columns_not_covered), + #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') + columns_covered_by_mothers = set() + for dj in range(len(x_start_without_mother)): + columns_covered_by_mothers.update( + range(x_start_without_mother[dj], + x_end_without_mother[dj])) + columns_not_covered = list(all_columns - columns_covered_by_mothers) + 
#print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), dtype=int) * top) ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + x_starting = np.append(x_starting, np.array(columns_not_covered, int)) + x_starting = np.append(x_starting, x_start_without_mother) + x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) + x_ending = np.append(x_ending, x_end_without_mother) - ind_args = np.arange(len(y_mid)) + columns_covered_by_mothers_with_child = set() + for dj in range(len(x_end_with_child_without_mother)): + columns_covered_by_mothers_with_child.update( + range(x_start_with_child_without_mother[dj], + x_end_with_child_without_mother[dj])) + #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") + columns_not_covered_by_mothers_with_child = list( + all_columns - columns_covered_by_mothers_with_child) + #indexes_to_be_spanned=[] + for i_s in range(len(x_end_with_child_without_mother)): + columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) + columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) + #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") + ind_args = np.arange(len(y_mid)) + for i_s_nc in columns_not_covered_by_mothers_with_child: + if i_s_nc in x_start_with_child_without_mother: + # use only seps with mother's span ("biggest") + #print("i_s_nc", i_s_nc) + x_end_biggest_column = \ + x_end_with_child_without_mother[ + x_start_with_child_without_mother == i_s_nc][0] + 
args_all_biggest_seps = \ + ind_args[(x_starting == i_s_nc) & + (x_ending == x_end_biggest_column)] + y_mid_column_nc = y_mid[args_all_biggest_seps] + #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") + #x_start_column_nc = x_starting[args_all_biggest_seps] + #x_end_column_nc = x_ending[args_all_biggest_seps] + y_mid_column_nc = np.sort(y_mid_column_nc) + #print(y_mid_column_nc, "y_mid_column_nc (sorted)") + for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): + #print("i_c", i_c) + #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") + ind_all_seps_between_nm_wc = \ + ind_args[(y_mid > nc_top) & + (y_mid < nc_bot) & + (x_starting >= i_s_nc) & + (x_ending <= x_end_biggest_column)] + y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] + x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] + x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] + columns_covered_by_mothers = set() + for dj in range(len(ind_all_seps_between_nm_wc)): + columns_covered_by_mothers.update( + range(x_starting_all_between_nm_wc[dj], + x_ending_all_between_nm_wc[dj])) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + child_columns = set(range(i_s_nc, x_end_biggest_column)) + columns_not_covered = list(child_columns - columns_covered_by_mothers) + #print(child_columns, "child_columns") + #print(columns_not_covered, "columns_not_covered") - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + if len(ind_all_seps_between_nm_wc): + biggest = 
np.argmax(x_ending_all_between_nm_wc - + x_starting_all_between_nm_wc) + #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") + #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest]), "biggest") + if columns_covered_by_mothers == set( + range(x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest])): + # single biggest accounts for all covered columns alone, + # this separator should be extended to cover all + seps_too_close_to_top_separator = \ + ((y_mid_all_between_nm_wc > nc_top) & + (y_mid_all_between_nm_wc <= nc_top + 500)) + if (np.count_nonzero(seps_too_close_to_top_separator) and + np.count_nonzero(seps_too_close_to_top_separator) < + len(ind_all_seps_between_nm_wc)): + #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") + y_mid_all_between_nm_wc = \ + y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] + x_starting_all_between_nm_wc = \ + x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] + x_ending_all_between_nm_wc = \ + x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] + + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_end_biggest_column) + else: + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) + + if len(columns_not_covered): + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) + x_ending_all_between_nm_wc = np.append( + 
x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) + + ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) + for column in range(int(i_s_nc), int(x_end_biggest_column)): + ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] + x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] + x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] + #print('babali3') + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + else: + #print(i_s_nc,'column not covered by mothers with child') + ind_args_in_col=ind_args[x_starting==i_s_nc] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + #print('babali3') + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) # create single-column boxes from multi-column separators y_mid_by_order = np.array(y_mid_by_order) @@ -2109,23 +2030,101 @@ def return_boxes_of_images_by_order_of_reading_new( for il in range(len(y_mid_by_order)): #print(il, "il") y_mid_itself = y_mid_by_order[il] - #print(y_mid_itself,'y_mid_itself') x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] - for column in range(x_start_itself, x_end_itself+1): + for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') + #print('burda') #print('burda2') y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & (column >= 
x_start_by_order) & (column <= x_end_by_order)] - #print(y_mid_next,'y_mid_next') y_mid_next = y_mid_next.min(initial=bot) #print(y_mid_next,'y_mid_next') + #print(y_mid_itself,'y_mid_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_mid_itself, y_mid_next]) - # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) + # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) + except: + logger.exception("cannot assign boxes") + boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], + top, bot]) + # dbg_plt(boxes[-1], "fallback box") + else: + # order multi-column separators + y_mid_by_order=[] + x_start_by_order=[] + x_end_by_order=[] + if len(x_starting)>0: + columns_covered_by_seps_covered_more_than_2col = set() + for dj in range(len(x_starting)): + if set(range(x_starting[dj], x_ending[dj])) != all_columns: + columns_covered_by_seps_covered_more_than_2col.update( + range(x_starting[dj], x_ending[dj])) + columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) + + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) + ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) + x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + if len(new_main_sep_y) > 0: + x_starting = np.append(x_starting, 0) + x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) + else: + x_starting = np.append(x_starting, x_starting[0]) + x_ending = np.append(x_ending, x_ending[0]) + else: + columns_not_covered = list(all_columns) + y_mid = np.append(y_mid, np.ones(len(columns_not_covered), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) + ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) + x_starting = 
np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + + ind_args = np.arange(len(y_mid)) + + for column in range(len(peaks_neg_tot)-1): + #print(column,'column') + ind_args_in_col=ind_args[x_starting==column] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) + for il in range(len(y_mid_by_order)): + #print(il, "il") + y_mid_itself = y_mid_by_order[il] + #print(y_mid_itself,'y_mid_itself') + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] + for column in range(x_start_itself, x_end_itself+1): + #print(column,'cols') + #print('burda2') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print(y_mid_next,'y_mid_next') + y_mid_next = y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') + boxes.append([peaks_neg_tot[column], + peaks_neg_tot[column+1], + y_mid_itself, + y_mid_next]) + # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) if right2left_readingorder: peaks_neg_tot_tables_new = [] From a2a9fe51175cfd11bc62d1e917bf79b299a7846e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:35:04 +0200 Subject: [PATCH 019/118] `delete_separator_around`: simplify, eynollah: identifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - use array instead of list operations - 
rename identifiers: - `pixel` → `label` - `line` → `sep` --- src/eynollah/eynollah.py | 104 ++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 57 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 08ffed7..eee3777 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2669,45 +2669,35 @@ class Eynollah: return layout_org, contours_new - def delete_separator_around(self, spliter_y,peaks_neg,image_by_region, pixel_line, pixel_table): + def delete_separator_around(self, splitter_y, peaks_neg, image_by_region, label_seps, label_table): # format of subboxes: box=[x1, x2 , y1, y2] pix_del = 100 - if len(image_by_region.shape)==3: - for i in range(len(spliter_y)-1): - for j in range(1,len(peaks_neg[i])-1): - ys = slice(int(spliter_y[i]), - int(spliter_y[i+1])) - xs = slice(peaks_neg[i][j] - pix_del, - peaks_neg[i][j] + pix_del) - image_by_region[ys,xs,0][image_by_region[ys,xs,0]==pixel_line] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,1]==pixel_line] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,2]==pixel_line] = 0 - - image_by_region[ys,xs,0][image_by_region[ys,xs,0]==pixel_table] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,1]==pixel_table] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,2]==pixel_table] = 0 - else: - for i in range(len(spliter_y)-1): - for j in range(1,len(peaks_neg[i])-1): - ys = slice(int(spliter_y[i]), - int(spliter_y[i+1])) - xs = slice(peaks_neg[i][j] - pix_del, - peaks_neg[i][j] + pix_del) - image_by_region[ys,xs][image_by_region[ys,xs]==pixel_line] = 0 - image_by_region[ys,xs][image_by_region[ys,xs]==pixel_table] = 0 + for i in range(len(splitter_y)-1): + for j in range(1,len(peaks_neg[i])-1): + where = np.index_exp[splitter_y[i]: + splitter_y[i+1], + peaks_neg[i][j] - pix_del: + peaks_neg[i][j] + pix_del, + :] + if image_by_region.ndim < 3: + where = where[:2] + else: + print("image_by_region ndim is 3!") # rs + 
image_by_region[where][image_by_region[where] == label_seps] = 0 + image_by_region[where][image_by_region[where] == label_table] = 0 return image_by_region def add_tables_heuristic_to_layout( self, image_regions_eraly_p, boxes, - slope_mean_hor, spliter_y, peaks_neg_tot, image_revised, - num_col_classifier, min_area, pixel_line): + slope_mean_hor, splitter_y, peaks_neg_tot, image_revised, + num_col_classifier, min_area, label_seps): - pixel_table =10 - image_revised_1 = self.delete_separator_around(spliter_y, peaks_neg_tot, image_revised, pixel_line, pixel_table) + label_table =10 + image_revised_1 = self.delete_separator_around(splitter_y, peaks_neg_tot, image_revised, label_seps, label_table) try: - image_revised_1[:,:30][image_revised_1[:,:30]==pixel_line] = 0 - image_revised_1[:,-30:][image_revised_1[:,-30:]==pixel_line] = 0 + image_revised_1[:,:30][image_revised_1[:,:30]==label_seps] = 0 + image_revised_1[:,-30:][image_revised_1[:,-30:]==label_seps] = 0 except: pass boxes = np.array(boxes, dtype=int) # to be on the safe side @@ -2718,7 +2708,7 @@ class Eynollah: _, thresh = cv2.threshold(image_col, 0, 255, 0) contours,hirarchy=cv2.findContours(thresh.copy(), cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) - if indiv==pixel_table: + if indiv==label_table: main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, max_area=1, min_area=0.001) else: @@ -2734,11 +2724,11 @@ class Eynollah: box_xs = slice(*boxes[i][0:2]) image_box = img_comm[box_ys, box_xs] try: - image_box_tabels_1 = (image_box == pixel_table) * 1 + image_box_tabels_1 = (image_box == label_table) * 1 contours_tab,_=return_contours_of_image(image_box_tabels_1) contours_tab=filter_contours_area_of_image_tables(image_box_tabels_1,contours_tab,_,1,0.003) - image_box_tabels_1 = (image_box == pixel_line).astype(np.uint8) * 1 - image_box_tabels_and_m_text = ( (image_box == pixel_table) | + image_box_tabels_1 = (image_box == label_seps).astype(np.uint8) * 1 + image_box_tabels_and_m_text = ( 
(image_box == label_table) | (image_box == 1) ).astype(np.uint8) * 1 image_box_tabels_1 = cv2.dilate(image_box_tabels_1, KERNEL, iterations=5) @@ -2800,7 +2790,7 @@ class Eynollah: y_up_tabs=[] for ii in range(len(y_up_tabs)): - image_box[y_up_tabs[ii]:y_down_tabs[ii]] = pixel_table + image_box[y_up_tabs[ii]:y_down_tabs[ii]] = label_table image_revised_last[box_ys, box_xs] = image_box else: @@ -2811,14 +2801,14 @@ class Eynollah: image_revised_last[box_ys, box_xs] = image_box if num_col_classifier==1: - img_tables_col_1 = (image_revised_last == pixel_table).astype(np.uint8) + img_tables_col_1 = (image_revised_last == label_table).astype(np.uint8) contours_table_col1, _ = return_contours_of_image(img_tables_col_1) _,_ ,_ , _, y_min_tab_col1 ,y_max_tab_col1, _= find_new_features_of_contours(contours_table_col1) if len(y_min_tab_col1)>0: for ijv in range(len(y_min_tab_col1)): - image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = pixel_table + image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = label_table return image_revised_last def get_tables_from_model(self, img, num_col_classifier): @@ -3153,14 +3143,14 @@ class Eynollah: text_regions_p_1_n = None textline_mask_tot_d = None regions_without_separators_d = None - pixel_lines = 3 + label_seps = 3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - _, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, pixel_lines) + _, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( + text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_1_n, num_col_classifier, self.tables, label_seps) 
#print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -3175,7 +3165,7 @@ class Eynollah: t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes_d = None self.logger.debug("len(boxes): %s", len(boxes)) @@ -3187,17 +3177,17 @@ class Eynollah: else: text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[(table_prediction == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, - num_col_classifier , 0.000005, pixel_line) + num_col_classifier , 0.000005, label_seps) #print(time.time()-t_0_box,'time box in 3.2') img_revised_tab2, contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction, 10, num_col_classifier) #print(time.time()-t_0_box,'time box in 3.3') else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) @@ -3210,11 +3200,11 @@ class Eynollah: text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes_d, 0, splitter_y_new_d, peaks_neg_tot_tables_d, text_regions_p_tables, - num_col_classifier, 0.000005, pixel_line) + 
num_col_classifier, 0.000005, label_seps) img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction_n, 10, num_col_classifier) @@ -3333,14 +3323,14 @@ class Eynollah: regions_without_separators = (text_regions_p[:,:] == 1)*1 regions_without_separators[table_prediction == 1] = 1 - pixel_lines=3 + label_seps=3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, pixel_lines) + text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: num_col_d, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) + text_regions_p_1_n, num_col_classifier, self.tables, label_seps) if num_col_classifier>=3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3359,10 +3349,10 @@ class Eynollah: num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[:,:][(table_prediction[:,:]==1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, - num_col_classifier , 0.000005, pixel_line) + num_col_classifier , 0.000005, label_seps) img_revised_tab2,contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction, 10, num_col_classifier) @@ -3374,11 +3364,11 @@ class Eynollah: text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes_d, 0, splitter_y_new_d, peaks_neg_tot_tables_d, text_regions_p_tables, - num_col_classifier, 
0.000005, pixel_line) + num_col_classifier, 0.000005, label_seps) img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction_n, 10, num_col_classifier) @@ -4721,12 +4711,12 @@ class Eynollah: regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( + boxes, _ = return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( + boxes_d, _ = return_boxes_of_images_by_order_of_reading_new( splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) From 3367462d181bca16316e84957299e0abb08ec0d8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 22:46:46 +0200 Subject: [PATCH 020/118] `return_boxes_of_images_by_order_of_reading_new`: change arg order --- src/eynollah/utils/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f30d55e..a163fad 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -33,7 +33,7 @@ def pairwise(iterable): a = b def return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, y_max_hor_some): + peak_points, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some): """ Analyse which separators overlap multiple column candidates, and how they overlap each other. 
@@ -54,10 +54,10 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( with no mother, specifically (and thus, no simple box separation is possible). Arguments: + * the x column coordinates * the x start column index of the raw separators * the x end column index of the raw separators * the y center coordinate of the raw separators - * the x column coordinates * the y end coordinate of the raw separators Returns: @@ -1736,7 +1736,7 @@ def return_boxes_of_images_by_order_of_reading_new( there_is_sep_with_child, \ y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) + peaks_neg_tot, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some) # show multi-column separators # dbg_plt([0, None, top, bot], "multi-column separators in current split", From 19b2c3fa424f8750e093a2fb88d7e6e381daeaab Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 22:51:19 +0200 Subject: [PATCH 021/118] reading order: improve handling of headings and horizontal seps - drop connected components analysis to test overlaps between horizontal separators and (horizontal) neighbours (introduced in ab17a927) - instead of converting headings to topline and baseline during `find_number_of_columns_in_document` (introduced in 9f1595d7), add them to the matrix unchanged, but mark as extra type (besides horizontal and vertical separtors) - convert headings to toplines and baselines no earlier than in `return_boxes_of_images_by_order_of_reading_new` - for both headings and horizontal separators, if they already span multiple columns, check if they would overlap (horizontal) neighbours by looking at successively larger (left and right) intervals of columns (and pick the largest elongation which does not introduce any overlaps) --- src/eynollah/utils/__init__.py | 127 
+++++++++++++++++++++------------ 1 file changed, 80 insertions(+), 47 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index a163fad..f3dbae2 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1387,8 +1387,6 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): return peaks_neg_tot def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): - ncomps, ccomps = cv2.connectedComponents(region_pre_p.astype(np.uint8)) - separators_closeup = 1 * (region_pre_p == label_seps) separators_closeup[0:110] = 0 separators_closeup[-150:] = 0 @@ -1414,14 +1412,6 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, dist_ye = max_ye - min_ye if dist_ye <= 50 and dist_xe >= 3 * dist_ye: cnts_hor_e.append(cnt) - labels = np.setdiff1d(np.unique(ccomps[med_ye]), [0]) - if len(labels) == 1: - # mid line does not intersect with any other region - # so add it as extra splitter line - cnts_hor_e.append(np.array([[[0, med_ye]], - [[ccomps.shape[1], med_ye]], - [[ccomps.shape[1], med_ye + 1]], - [[0, med_ye + 1]]])) # delete horizontal contours (leaving only the edges) separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) @@ -1493,7 +1483,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, slope_seps_org_hor=slope_seps_org_hor[dist_x_hor>=len_x/2.0] dist_x_hor=dist_x_hor[dist_x_hor>=len_x/2.0] - matrix_of_seps_ch=np.zeros((len(cy_seps_hor)+len(cx_seps_ver),10)) + matrix_of_seps_ch = np.zeros((len(cy_seps_hor)+len(cx_seps_ver), 10), dtype=int) matrix_of_seps_ch[:len(cy_seps_hor),0]=args_hor matrix_of_seps_ch[len(cy_seps_hor):,0]=args_ver matrix_of_seps_ch[len(cy_seps_hor):,1]=cx_seps_ver @@ -1515,34 +1505,17 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, if contours_h is not None: _, dist_x_head, 
x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) - # matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) - # args_head = np.arange(len(cy_head)) - # matrix_l_n[:, 0] = args_head - # matrix_l_n[:, 2] = x_min_head+30 - # matrix_l_n[:, 3] = x_max_head-30 - # matrix_l_n[:, 4] = dist_x_head - # matrix_l_n[:, 5] = y_min_head-3-8 - # matrix_l_n[:, 6] = y_min_head-5-8 - # matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 - # matrix_l_n[:, 8] = 4 - # split at toplines (y_min_head) and baselines (y_max_head) instead of center (cy_head): - cy_head = np.stack((y_min_head, y_max_head)).T.flatten() - y_min_head, y_max_head = (np.stack((y_min_head - 2, y_max_head - 2)).T.flatten(), - np.stack((y_min_head + 2, y_max_head + 2)).T.flatten()) - x_min_head = np.repeat(x_min_head, 2) - x_max_head = np.repeat(x_max_head, 2) - dist_x_head = np.repeat(dist_x_head, 2) - matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1]), dtype=int) args_head = np.arange(len(cy_head)) matrix_l_n[:, 0] = args_head - # +/- 30px to avoid crossing col peaks by accident - matrix_l_n[:, 2] = x_min_head + 30 - matrix_l_n[:, 3] = x_max_head - 30 + matrix_l_n[:, 2] = x_min_head + matrix_l_n[:, 3] = x_max_head matrix_l_n[:, 4] = dist_x_head matrix_l_n[:, 5] = cy_head matrix_l_n[:, 6] = y_min_head matrix_l_n[:, 7] = y_max_head - matrix_l_n[:, 8] = 4 + matrix_l_n[:, 8] = y_max_head - y_min_head + matrix_l_n[:, 9] = 2 # mark as heading (so it can be split into 2 horizontal separators as needed) matrix_of_seps_ch = np.append( matrix_of_seps_ch, matrix_l_n, axis=0) @@ -1551,9 +1524,12 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, cy_seps_splitters = np.append(cy_seps_splitters, special_separators) if contours_h is not None: - cy_seps_splitters_head=cy_head[(x_min_head<=.16*region_pre_p.shape[1]) & - (x_max_head>=.84*region_pre_p.shape[1])] - 
cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) + y_min_splitters_head = y_min_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + y_max_splitters_head = y_max_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, y_min_splitters_head) + cy_seps_splitters = np.append(cy_seps_splitters, y_max_splitters_head) cy_seps_splitters = np.sort(cy_seps_splitters).astype(int) splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] @@ -1713,17 +1689,6 @@ def return_boxes_of_images_by_order_of_reading_new( #num_col, peaks_neg_fin = find_num_col( # regions_without_separators[top:bot,:], # multiplier=7.0) - x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] - x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] - cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - - if right2left_readingorder: - x_max_hor_some_new = width_tot - x_min_hor_some - x_min_hor_some_new = width_tot - x_max_hor_some - x_min_hor_some =list(np.copy(x_min_hor_some_new)) - x_max_hor_some =list(np.copy(x_max_hor_some_new)) - peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) #print(peaks_neg_tot,'peaks_neg_tot') peaks_neg_tot_tables.append(peaks_neg_tot) @@ -1731,6 +1696,74 @@ def return_boxes_of_images_by_order_of_reading_new( all_columns = set(range(len(peaks_neg_tot) - 1)) #print("all_columns", all_columns) + # elongate horizontal separators+headings as much as possible without overlap + args_nonver = matrix_new[:, 9] != 1 + regions_with_separators = np.copy(regions_without_separators[top:bot]) + for xmin, xmax, ymin, ymax in matrix_new[:, [2, 3, 6, 7]]: + regions_with_separators[ymin - top: ymax - top, xmin: xmax] = 6 + # def dbg_imshow(box, title): + # xmin, xmax, ymin, ymax = box + # plt.imshow(regions_with_separators, extent=[0, width_tot, bot, top]) + # 
plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax)) + # plt.show() + for i in np.flatnonzero(args_nonver): + xmin, xmax, ymin, ymax, typ = matrix_new[i, [2, 3, 6, 7, 9]] + cut = regions_with_separators[ymin - top: ymax - top] + # dbg_imshow([xmin, xmax, ymin, ymax], "separator %d (%s)" % (i, "heading" if typ else "horizontal")) + starting = xmin - peaks_neg_tot + min_start = np.flatnonzero(starting >= 0)[-1] # last left-of + ending = xmax - peaks_neg_tot + max_end = np.flatnonzero(ending < 0)[0] # first right-of + # skip elongation unless this is already a multi-column separator/heading: + if not max_end - min_start > 1: + continue + # is there anything left of min_start? + for j in range(min_start): + # dbg_imshow([peaks_neg_tot[j], xmin, ymin, ymax], "start of %d candidate %d" % (i, j)) + if not np.any(cut[:, peaks_neg_tot[j]: xmin]): + # print("elongated sep", i, "typ", typ, "start", xmin, "to", j, peaks_neg_tot[j]) + matrix_new[i, 2] = peaks_neg_tot[j] + 1 # elongate to start of this column + break + # is there anything right of max_end? 
+ for j in range(len(peaks_neg_tot) - 1, max_end, -1): + # dbg_imshow([xmax, peaks_neg_tot[j], ymin, ymax], "end of %d candidate %d" % (i, j)) + if not np.any(cut[:, xmax: peaks_neg_tot[j]]): + # print("elongated sep", i, "typ", typ, "end", xmax, "to", j, peaks_neg_tot[j]) + matrix_new[i, 3] = peaks_neg_tot[j] - 1 # elongate to end of this column + break + + args_hor = matrix_new[:, 9] == 0 + x_min_hor_some = matrix_new[:, 2][args_hor] + x_max_hor_some = matrix_new[:, 3][args_hor] + y_max_hor_some = matrix_new[:, 7][args_hor] + cy_hor_some = matrix_new[:, 5][args_hor] + + args_head = matrix_new[:, 9] == 2 + x_min_hor_head = matrix_new[:, 2][args_head] + x_max_hor_head = matrix_new[:, 3][args_head] + y_min_hor_head = matrix_new[:, 6][args_head] + y_max_hor_head = matrix_new[:, 7][args_head] + cy_hor_head = matrix_new[:, 5][args_head] + + # split headings at toplines (y_min_head) and baselines (y_max_head) + # instead of merely adding their center (cy_head) as horizontal separator + # (x +/- 30px to avoid crossing col peaks by accident) + x_min_hor_some = np.append(x_min_hor_some, np.tile(x_min_hor_head + 30, 2)) + x_max_hor_some = np.append(x_max_hor_some, np.tile(x_max_hor_head - 30, 2)) + y_max_hor_some = np.append(y_max_hor_some, # baselines + np.concatenate((y_min_hor_head + 2, + y_max_hor_head + 2))) + cy_hor_some = np.append(cy_hor_some, # toplines + np.concatenate((y_min_hor_head - 2, + y_max_hor_head - 2))) + + if right2left_readingorder: + x_max_hor_some = width_tot - x_min_hor_some + x_min_hor_some = width_tot - x_max_hor_some + + reading_order_type, x_starting, x_ending, y_mid, y_max, \ y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ there_is_sep_with_child, \ From 1a76ce177dba69aa711b74e6c69022e4a5ebf27f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Nov 2025 02:07:20 +0100 Subject: [PATCH 022/118] do_order_of_regions: round contour centers (so we can be sure they do not fall through the "pixel cracks": bboxes are 
delimited by integers, and we do not want to assign contours between boxes) --- src/eynollah/eynollah.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index eee3777..35b0a37 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2491,11 +2491,15 @@ class Eynollah: contours_only_text_parent) cx_head, cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours( contours_only_text_parent_h) + cx_main = np.array(cx_main, dtype=int) + cy_main = np.array(cy_main, dtype=int) + cx_head = np.array(cx_head, dtype=int) + cy_head = np.array(cy_head, dtype=int) def match_boxes(only_centers: bool): arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): - check_if_textregion_located_in_a_box = False + box_found = False for jj, box in enumerate(boxes): if ((cx_main[ii] >= box[0] and cx_main[ii] < box[1] and @@ -2506,22 +2510,23 @@ class Eynollah: my_main[ii] >= box[2] and My_main[ii] < box[3])): arg_text_con_main[ii] = jj - check_if_textregion_located_in_a_box = True - #print("main/matched", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", box, only_centers) + box_found = True + # print("main/matched ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", jj, box, only_centers) break - if not check_if_textregion_located_in_a_box: + if not box_found: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) + assert pcontained_in_box.any(), (ii, cx_main[ii], cy_main[ii]) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_main[ii] = ind_min - #print("main/fallback", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", 
boxes[ind_min], only_centers) + # print("main/fallback ", ii, "\t", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", ind_min, boxes[ind_min], only_centers) args_contours_main = np.arange(len(contours_only_text_parent)) order_by_con_main = np.zeros_like(arg_text_con_main) arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) for ii in range(len(contours_only_text_parent_h)): - check_if_textregion_located_in_a_box = False + box_found = False for jj, box in enumerate(boxes): if ((cx_head[ii] >= box[0] and cx_head[ii] < box[1] and @@ -2532,16 +2537,17 @@ class Eynollah: my_head[ii] >= box[2] and My_head[ii] < box[3])): arg_text_con_head[ii] = jj - check_if_textregion_located_in_a_box = True - #print("head/matched", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", box, only_centers) + box_found = True + # print("head/matched ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", jj, box, only_centers) break - if not check_if_textregion_located_in_a_box: + if not box_found: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) + assert pcontained_in_box.any(), (ii, cx_head[ii], cy_head[ii]) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_head[ii] = ind_min - #print("head/fallback", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", boxes[ind_min], only_centers) + # print("head/fallback ", ii, "\t", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", ind_min, boxes[ind_min], only_centers) args_contours_head = np.arange(len(contours_only_text_parent_h)) order_by_con_head = np.zeros_like(arg_text_con_head) From 95f76081d1de4611d3007ef14a342d7dbb530584 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Nov 2025 02:22:39 +0100 Subject: [PATCH 023/118] 
rename some more identifiers: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `lines` → `seps` (to distinguish from textlines) - `text_regions_p_1_n` → `text_regions_p_d` (because all other deskewed variables are called like this) - `pixel` → `label` --- src/eynollah/eynollah.py | 178 +++++++++++++++++++-------------------- 1 file changed, 89 insertions(+), 89 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 35b0a37..2bdb2c7 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2091,19 +2091,19 @@ class Eynollah: prediction_regions_org = prediction_regions_org[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only = (prediction_regions_org[:,:] ==3)*1 + mask_seps_only = (prediction_regions_org[:,:] == 3)*1 mask_texts_only = (prediction_regions_org[:,:] ==1)*1 mask_images_only=(prediction_regions_org[:,:] ==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_seps, color=(3,3,3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true 
= cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_texts, color=(1,1,1)) @@ -2282,7 +2282,7 @@ class Eynollah: img_bin = resize_image(img_bin, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only = (prediction_regions_org[:,:] ==3)*1 + mask_seps_only = (prediction_regions_org[:,:] == 3)*1 mask_texts_only = (prediction_regions_org[:,:] ==1)*1 mask_texts_only = mask_texts_only.astype('uint8') @@ -2293,7 +2293,7 @@ class Eynollah: mask_texts_only = cv2.dilate(mask_texts_only, kernel=np.ones((2,2), np.uint8), iterations=1) mask_images_only=(prediction_regions_org[:,:] ==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) test_khat = np.zeros(prediction_regions_org.shape) test_khat = cv2.fillPoly(test_khat, pts=polygons_seplines, color=(1,1,1)) @@ -2307,7 +2307,7 @@ class Eynollah: #plt.show() polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) test_khat = np.zeros(prediction_regions_org.shape) test_khat = cv2.fillPoly(test_khat, pts = polygons_seplines, color=(1,1,1)) @@ -2318,10 +2318,10 @@ class Eynollah: polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) ##polygons_of_only_texts = dilate_textregion_contours(polygons_of_only_texts) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_seps, color=(3,3,3)) 
text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) @@ -2377,7 +2377,7 @@ class Eynollah: prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h ) mask_zeros2 = (prediction_regions_org2[:,:,0] == 0) - mask_lines2 = (prediction_regions_org2[:,:,0] == 3) + mask_seps2 = (prediction_regions_org2[:,:,0] == 3) text_sume_early = (prediction_regions_org[:,:] == 1).sum() prediction_regions_org_copy = np.copy(prediction_regions_org) prediction_regions_org_copy[(prediction_regions_org_copy[:,:]==1) & (mask_zeros2[:,:]==1)] = 0 @@ -2388,8 +2388,8 @@ class Eynollah: if not(is_image_enhanced and rate_two_models < RATIO_OF_TWO_MODEL_THRESHOLD): prediction_regions_org = np.copy(prediction_regions_org_copy) - prediction_regions_org[(mask_lines2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 - mask_lines_only=(prediction_regions_org[:,:]==3)*1 + prediction_regions_org[(mask_seps2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 + mask_seps_only=(prediction_regions_org[:,:]==3)*1 prediction_regions_org = cv2.erode(prediction_regions_org[:,:], KERNEL, iterations=2) prediction_regions_org = cv2.dilate(prediction_regions_org[:,:], KERNEL, iterations=2) @@ -2411,20 +2411,20 @@ class Eynollah: prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - mask_lines_only=(prediction_regions_org[:,:]==3)*1 + mask_seps_only=(prediction_regions_org[:,:]==3)*1 mask_texts_only=(prediction_regions_org[:,:]==1)*1 mask_images_only=(prediction_regions_org[:,:]==2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, 
polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only, 1, 0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only, 1, 0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only, 1, 0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true,pts = polygons_of_only_lines, color=(3, 3, 3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true,pts = polygons_of_only_seps, color=(3, 3, 3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1)) @@ -2449,7 +2449,7 @@ class Eynollah: prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) prediction_regions_org=prediction_regions_org[:,:,0] - #mask_lines_only=(prediction_regions_org[:,:]==3)*1 + #mask_seps_only=(prediction_regions_org[:,:]==3)*1 #img = resize_image(img_org, int(img_org.shape[0]*1), int(img_org.shape[1]*1)) #prediction_regions_org = self.do_prediction(True, img, self.models["region"]) @@ -2457,19 +2457,19 @@ class Eynollah: #prediction_regions_org = prediction_regions_org[:,:,0] #prediction_regions_org[(prediction_regions_org[:,:] == 1) & (mask_zeros_y[:,:] == 1)]=0 - mask_lines_only = (prediction_regions_org == 3)*1 + mask_seps_only = (prediction_regions_org == 3)*1 mask_texts_only = (prediction_regions_org == 1)*1 mask_images_only= (prediction_regions_org == 2)*1 - polygons_seplines, hir_seplines = return_contours_of_image(mask_lines_only) + polygons_seplines, hir_seplines = return_contours_of_image(mask_seps_only) polygons_seplines = filter_contours_area_of_image( - mask_lines_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) + mask_seps_only, polygons_seplines, hir_seplines, max_area=1, min_area=0.00001, dilate=1) 
polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only,1,0.00001) - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only,1,0.00001) + polygons_of_only_seps = return_contours_of_interested_region(mask_seps_only,1,0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_lines, color=(3,3,3)) + text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_seps, color=(3,3,3)) text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) @@ -2952,8 +2952,8 @@ class Eynollah: mask_images = (text_regions_p_1[:, :] == 2) * 1 mask_images = mask_images.astype(np.uint8) mask_images = cv2.erode(mask_images[:, :], KERNEL, iterations=10) - mask_lines = (text_regions_p_1[:, :] == 3) * 1 - mask_lines = mask_lines.astype(np.uint8) + mask_seps = (text_regions_p_1[:, :] == 3) * 1 + mask_seps = mask_seps.astype(np.uint8) img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) @@ -2979,7 +2979,7 @@ class Eynollah: self.logger.exception(why) num_col = None #print("inside graphics 3 ", time.time() - t_in_gr) - return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, + return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea, img_bin_light) def run_graphics_and_columns_without_layout(self, textline_mask_tot_ea, img_bin_light): @@ -3029,8 +3029,8 @@ class Eynollah: mask_images = (text_regions_p_1[:, :] == 2) * 1 mask_images = mask_images.astype(np.uint8) mask_images = cv2.erode(mask_images[:, :], KERNEL, iterations=10) - mask_lines = (text_regions_p_1[:, :] == 3) * 1 - 
mask_lines = mask_lines.astype(np.uint8) + mask_seps = (text_regions_p_1[:, :] == 3) * 1 + mask_seps = mask_seps.astype(np.uint8) img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) @@ -3046,7 +3046,7 @@ class Eynollah: except Exception as why: self.logger.exception(why) num_col = None - return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, + return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, text_regions_p_1, cont_page, table_prediction) def run_enhancement(self, light_version): @@ -3101,13 +3101,13 @@ class Eynollah: return slope_deskew def run_marginals( - self, textline_mask_tot_ea, mask_images, mask_lines, + self, textline_mask_tot_ea, mask_images, mask_seps, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction): textline_mask_tot = textline_mask_tot_ea[:, :] textline_mask_tot[mask_images[:, :] == 1] = 0 - text_regions_p_1[mask_lines[:, :] == 1] = 3 + text_regions_p_1[mask_seps[:, :] == 1] = 3 text_regions_p = text_regions_p_1[:, :] text_regions_p = np.array(text_regions_p) if num_col_classifier in (1, 2): @@ -3131,12 +3131,12 @@ class Eynollah: self.logger.debug('enter run_boxes_no_full_layout') t_0_box = time.time() if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = rotation_not_90_func( + _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = rotation_not_90_func( image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) 
table_prediction_n = resize_image(table_prediction_n, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_separators_d = (text_regions_p_1_n[:, :] == 1) * 1 + regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 if self.tables: regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 regions_without_separators = (text_regions_p[:, :] == 1) * 1 @@ -3146,7 +3146,7 @@ class Eynollah: if self.tables: regions_without_separators[table_prediction ==1 ] = 1 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None label_seps = 3 @@ -3156,7 +3156,7 @@ class Eynollah: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps) + text_regions_p_d, num_col_classifier, self.tables, label_seps) #print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -3171,7 +3171,7 @@ class Eynollah: t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_seps_ch, + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes_d = None self.logger.debug("len(boxes): %s", len(boxes)) @@ -3193,7 +3193,7 @@ class Eynollah: #print(time.time()-t_0_box,'time box in 3.3') else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_seps_ch_d, + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) @@ -3202,7 +3202,7 
@@ class Eynollah: if self.light_version: pass else: - text_regions_p_tables = np.copy(text_regions_p_1_n) + text_regions_p_tables = np.copy(text_regions_p_d) text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 @@ -3245,22 +3245,22 @@ class Eynollah: else: polygons_of_images = return_contours_of_interested_region(img_revised_tab, 2) - pixel_img = 4 + label_marginalia = 4 min_area_mar = 0.00001 if self.light_version: - marginal_mask = (text_regions_p[:,:]==pixel_img)*1 + marginal_mask = (text_regions_p[:,:]==label_marginalia)*1 marginal_mask = marginal_mask.astype('uint8') marginal_mask = cv2.dilate(marginal_mask, KERNEL, iterations=2) polygons_of_marginals = return_contours_of_interested_region(marginal_mask, 1, min_area_mar) else: - polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + polygons_of_marginals = return_contours_of_interested_region(text_regions_p, label_marginalia, min_area_mar) - pixel_img = 10 - contours_tables = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + label_tables = 10 + contours_tables = return_contours_of_interested_region(text_regions_p, label_tables, min_area_mar) #print(time.time()-t_0_box,'time box in 5') self.logger.debug('exit run_boxes_no_full_layout') - return (polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, + return (polygons_of_images, img_revised_tab, text_regions_p_d, textline_mask_tot_d, regions_without_separators_d, boxes, boxes_d, polygons_of_marginals, contours_tables) @@ -3276,13 +3276,13 @@ class Eynollah: text_regions_p[:,:][table_prediction[:,:]==1] = 10 img_revised_tab = text_regions_p[:,:] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ + _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ rotation_not_90_func(image_page, textline_mask_tot, 
text_regions_p, table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, + text_regions_p.shape[0], + text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) @@ -3290,10 +3290,10 @@ class Eynollah: text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_separators_d = (text_regions_p_1_n[:,:] == 1)*1 + regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None # regions_without_separators = ( text_regions_p[:,:]==1 | text_regions_p[:,:]==2 )*1 @@ -3303,13 +3303,13 @@ class Eynollah: else: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, table_prediction_n = \ + _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, + text_regions_p.shape[0], + text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) @@ -3317,10 +3317,10 @@ class Eynollah: text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_separators_d = (text_regions_p_1_n[:,:] == 1)*1 + regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None @@ -3331,12 +3331,12 @@ class Eynollah: label_seps=3 if np.abs(slope_deskew) < 
SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - num_col_d, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps) + num_col_d, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) if num_col_classifier>=3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3351,7 +3351,7 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[:,:][(table_prediction[:,:]==1)] = 10 @@ -3364,9 +3364,9 @@ class Eynollah: img_revised_tab2, table_prediction, 10, num_col_classifier) else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) - text_regions_p_tables = np.copy(text_regions_p_1_n) + text_regions_p_tables = np.copy(text_regions_p_d) text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 @@ -3399,20 +3399,20 @@ class Eynollah: text_regions_p[img_revised_tab == 10] = 10 #img_revised_tab[img_revised_tab2 == 10] = 10 - pixel_img = 4 + label_marginalia = 4 
min_area_mar = 0.00001 if self.light_version: - marginal_mask = (text_regions_p[:,:]==pixel_img)*1 + marginal_mask = (text_regions_p[:,:]==label_marginalia)*1 marginal_mask = marginal_mask.astype('uint8') marginal_mask = cv2.dilate(marginal_mask, KERNEL, iterations=2) polygons_of_marginals = return_contours_of_interested_region(marginal_mask, 1, min_area_mar) else: - polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + polygons_of_marginals = return_contours_of_interested_region(text_regions_p, label_marginalia, min_area_mar) - pixel_img = 10 - contours_tables = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) + label_tables = 10 + contours_tables = return_contours_of_interested_region(text_regions_p, label_tables, min_area_mar) # set first model with second model text_regions_p[:, :][text_regions_p[:, :] == 2] = 5 @@ -3465,16 +3465,16 @@ class Eynollah: #plt.show() ####if not self.tables: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout( + _, textline_mask_tot_d, text_regions_p_d, regions_fully_n = rotation_not_90_func_full_layout( image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) + text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) if not self.tables: - regions_without_separators_d = (text_regions_p_1_n[:, :] == 1) * 1 + regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 else: - text_regions_p_1_n = None + text_regions_p_d = None textline_mask_tot_d = None regions_without_separators_d = None if not 
self.tables: @@ -3484,7 +3484,7 @@ class Eynollah: self.logger.debug('exit run_boxes_full_layout') #print("full inside 3", time.time()- t_full0) - return (polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, + return (polygons_of_images, img_revised_tab, text_regions_p_d, textline_mask_tot_d, regions_without_separators_d, regions_fully, regions_without_separators, polygons_of_marginals, contours_tables) @@ -4301,7 +4301,7 @@ class Eynollah: slope_deskew = self.run_deskew(textline_mask_tot_ea) #print("text region early -2,5 in %.1fs", time.time() - t0) #self.logger.info("Textregion detection took %.1fs ", time.time() - t1t) - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, \ text_regions_p_1, cont_page, table_prediction, textline_mask_tot_ea, img_bin_light = \ self.run_graphics_and_columns_light(text_regions_p_1, textline_mask_tot_ea, num_col_classifier, num_column_is_classified, @@ -4318,7 +4318,7 @@ class Eynollah: confidence_matrix = np.zeros((text_regions_p_1.shape[:2])) t1 = time.time() - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, \ + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_seps, \ text_regions_p_1, cont_page, table_prediction = \ self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified, erosion_hurts) @@ -4356,12 +4356,12 @@ class Eynollah: image_page = resize_image(image_page,img_h_new, img_w_new ) textline_mask_tot_ea = resize_image(textline_mask_tot_ea,img_h_new, img_w_new ) mask_images = resize_image(mask_images,img_h_new, img_w_new ) - mask_lines = resize_image(mask_lines,img_h_new, img_w_new ) + mask_seps = resize_image(mask_seps, img_h_new, img_w_new) text_regions_p_1 = resize_image(text_regions_p_1,img_h_new, img_w_new ) table_prediction = 
resize_image(table_prediction,img_h_new, img_w_new ) textline_mask_tot, text_regions_p = \ - self.run_marginals(textline_mask_tot_ea, mask_images, mask_lines, + self.run_marginals(textline_mask_tot_ea, mask_images, mask_seps, num_col_classifier, slope_deskew, text_regions_p_1, table_prediction) if self.plotter: self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) @@ -4398,14 +4398,14 @@ class Eynollah: ## birdan sora chock chakir t1 = time.time() if not self.full_layout: - polygons_of_images, img_revised_tab, text_regions_p_1_n, \ + polygons_of_images, img_revised_tab, text_regions_p_d, \ textline_mask_tot_d, regions_without_separators_d, \ boxes, boxes_d, polygons_of_marginals, contours_tables = \ self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, table_prediction, erosion_hurts) ###polygons_of_marginals = dilate_textregion_contours(polygons_of_marginals) else: - polygons_of_images, img_revised_tab, text_regions_p_1_n, \ + polygons_of_images, img_revised_tab, text_regions_p_d, \ textline_mask_tot_d, regions_without_separators_d, \ regions_fully, regions_without_separators, polygons_of_marginals, contours_tables = \ self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, @@ -4419,7 +4419,7 @@ class Eynollah: text_only = (img_revised_tab[:, :] == 1) * 1 if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - text_only_d = (text_regions_p_1_n[:, :] == 1) * 1 + text_only_d = ((text_regions_p_d[:, :] == 1)) * 1 #print("text region early 2 in %.1fs", time.time() - t0) ###min_con_area = 0.000005 @@ -4695,18 +4695,18 @@ class Eynollah: label_seps = 6 if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps, 
contours_only_text_parent_h) else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( + num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps) else: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, label_seps) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -4718,12 +4718,12 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, _ = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, text_regions_p, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: boxes_d, _ = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, text_regions_p_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: From 4abc2ff57249e634c70cda665abc5d99429595d2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Nov 2025 03:05:02 +0100 Subject: [PATCH 024/118] rewrite/simplify manual reading order 
using recursive algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - rename `return_x_start_end_mothers_childs_and_type_of_reading_order` → `return_multicol_separators_x_start_end`, and drop all the analysis pertaining to mother/child relationships and full-span separators, also drop the separator unification rules; instead of the latter, try to combine neighbouring separators more generally: join column spans iff there is nothing in between (which also necessitates passing the region mask), and keep only one of every such redundant pair; add the top (of each page part) as full-span separator up front, and return separators already ordered by y - `return_boxes_of_images_by_order_of_reading_new`: - also pass regions with separators, so they do not have to be reconstructed from the separator coordinates, and also contain images and other non-text region types, when trying to elongate separators to maximize their span (without introducing overlaps) - determine connected components of the region mask, i.e. labels and their respective bboxes, in order to 1. gain additional multi-column separators, if possible 2. avoid cutting through regions which do cross column boundaries later on - whenever adding a new bbox, first look up the label map to see if there are any multi-column regions extending to the right of the current column; if there are, then advance not just one column to the right, but as many as necessary to avoid cutting through these regions - new core algorithm: iterate separators sorted by y and then column by column, but whenever the next separator ends in the same column as the current one or even further left, recurse (i.e. 
finish that span first before continuing with the top iteration) --- src/eynollah/utils/__init__.py | 935 ++++++++++----------------------- 1 file changed, 277 insertions(+), 658 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f3dbae2..e00004f 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -32,289 +32,132 @@ def pairwise(iterable): yield a, b a = b -def return_x_start_end_mothers_childs_and_type_of_reading_order( - peak_points, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some): +def return_multicol_separators_x_start_end( + regions_without_separators, peak_points, top, bot, + x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some): """ Analyse which separators overlap multiple column candidates, and how they overlap each other. Ignore separators not spanning multiple columns. - For the separators to be returned, try to join them when they are directly - adjacent horizontally but nearby vertically (and thus mutually compatible). - Also, mark any separators that already span the full width. - - Furthermore, identify which pairs of (unjoined) separators span subsets of columns - of each other (disregarding vertical positions). Referring, respectively, to the - superset separators as "mothers" and to the subset separators as "children", - retrieve information on which columns are spanned by separators with no mother, - and which columns are spanned by their children (if any). - - Moreover, determine if there is any (column) overlap among the multi-span separators - with no mother, specifically (and thus, no simple box separation is possible). + For the separators to be returned, try to remove or unify them when there + is no region between them (vertically) and their neighbours. 
Arguments: + * the text mask (with all separators suppressed) * the x column coordinates - * the x start column index of the raw separators - * the x end column index of the raw separators - * the y center coordinate of the raw separators - * the y end coordinate of the raw separators + * the y start coordinate to consider in total + * the y end coordinate to consider in total + * the x start coordinate of the horizontal separators + * the x end coordinate of the horizontal separators + * the y start coordinate of the horizontal separators + * the y center coordinate of the horizontal separators + * the y end coordinate of the horizontal separators Returns: a tuple of: - * whether any top-level (no-mother) multi-span separators overlap each other * the x start column index of the resulting multi-span separators * the x end column index of the resulting multi-span separators + * the y start coordinate of the resulting multi-span separators * the y center coordinate of the resulting multi-span separators * the y end coordinate of the resulting multi-span separators - * the y center (for 1 representative) of the top-level (no-mother) multi-span separators - * the x start column index of the top-level (no-mother) multi-span separators - * the x end column index of the top-level (no-mother) multi-span separators - * whether any multi-span separators have super-spans of other (child) multi-span separators - * the y center (for 1 representative) of the top-level (no-mother) multi-span separators - which have super-spans of other (child) multi-span separators - * the x start column index of the top-level multi-span separators - which have super-spans of other (child) multi-span separators - * the x end column index of the top-level multi-span separators - which have super-spans of other (child) multi-span separators - * indexes of multi-span separators with full-width span """ - x_start=[] - x_end=[] - len_sep=[] - y_mid=[] - y_max=[] - new_main_sep_y=[] - indexer=0 + 
x_start = [0] + x_end = [len(peak_points) - 1] + y_min = [top] + y_mid = [top] + y_max = [top + 2] + indexer = 1 for i in range(len(x_min_hor_some)): #print(indexer, "%d:%d" % (x_min_hor_some[i], x_max_hor_some[i]), cy_hor_some[i]) starting = x_min_hor_some[i] - peak_points min_start = np.flatnonzero(starting >= 0)[-1] # last left-of ending = x_max_hor_some[i] - peak_points - max_end = np.flatnonzero(ending < 0)[0] # first right-of + max_end = np.flatnonzero(ending <= 0)[0] # first right-of #print(indexer, "%d:%d" % (min_start, max_end)) if (max_end-min_start)>=2: # column range of separator spans more than one column candidate - if (max_end-min_start)==(len(peak_points)-1): - # all columns (i.e. could be true new y splitter) - new_main_sep_y.append(indexer) - #print((max_end-min_start),len(peak_points),'(max_end-min_start)') + y_min.append(y_min_hor_some[i]) y_mid.append(cy_hor_some[i]) y_max.append(y_max_hor_some[i]) x_end.append(max_end) x_start.append(min_start) - len_sep.append(max_end-min_start) indexer+=1 #print(x_start,'x_start') #print(x_end,'x_end') - x_start_returned = np.array(x_start, dtype=int) - x_end_returned = np.array(x_end, dtype=int) - y_mid_returned = np.array(y_mid, dtype=int) - y_max_returned = np.array(y_max, dtype=int) - #print(y_mid_returned,'y_mid_returned') - #print(x_start_returned,'x_start_returned') - #print(x_end_returned,'x_end_returned') - - # join/elongate separators if follow-up x and similar y - sep_pairs = contours_in_same_horizon(y_mid_returned) - if len(sep_pairs): - #print('burda') - args_to_be_unified = set() - y_mid_unified = [] - y_max_unified = [] - x_start_unified = [] - x_end_unified = [] - for pair in sep_pairs: - if (not np.array_equal(*x_start_returned[pair]) and - not np.array_equal(*x_end_returned[pair]) and - # immediately adjacent columns? 
- np.diff(x_end_returned[pair] - - x_start_returned[pair])[0] in [1, -1]): - - args_to_be_unified.union(set(pair)) - y_mid_unified.append(np.min(y_mid_returned[pair])) - y_max_unified.append(np.max(y_max_returned[pair])) - x_start_unified.append(np.min(x_start_returned[pair])) - x_end_unified.append(np.max(x_end_returned[pair])) - #print(pair,'pair') - #print(x_start_returned[pair],'x_s_same_hor') - #print(x_end_returned[pair],'x_e_same_hor') - #print(y_mid_unified,'y_mid_unified') - #print(y_max_unified,'y_max_unified') - #print(x_start_unified,'x_s_unified') - #print(x_end_unified,'x_e_selected') - #print('#############################') - - if len(y_mid_unified): - args_lines_not_unified = np.setdiff1d(np.arange(len(y_mid_returned)), - list(args_to_be_unified), assume_unique=True) - #print(args_lines_not_unified,'args_lines_not_unified') - x_start_returned = np.append(x_start_returned[args_lines_not_unified], - x_start_unified, axis=0) - x_end_returned = np.append(x_end_returned[args_lines_not_unified], - x_end_unified, axis=0) - y_mid_returned = np.append(y_mid_returned[args_lines_not_unified], - y_mid_unified, axis=0) - y_max_returned = np.append(y_max_returned[args_lines_not_unified], - y_max_unified, axis=0) - #print(y_mid_returned,'y_mid_returned2') - #print(x_start_returned,'x_start_returned2') - #print(x_end_returned,'x_end_returned2') - - #print(new_main_sep_y,'new_main_sep_y') - #print(x_start,'x_start') - #print(x_end,'x_end') - x_start = np.array(x_start) - x_end = np.array(x_end) - y_mid = np.array(y_mid) - if len(new_main_sep_y): - # some full-width multi-span separators exist, so - # restrict the y range of separators to search for - # mutual overlaps to only those within the largest - # y strip between adjacent multi-span separators - # that involve at least one such full-width seps. 
- # (does not affect the separators to be returned) - min_ys=np.min(y_mid) - max_ys=np.max(y_mid) - #print(min_ys,'min_ys') - #print(max_ys,'max_ys') - - y_mains0 = list(y_mid[new_main_sep_y]) - y_mains = [min_ys] + y_mains0 + [max_ys] - - y_mains = np.sort(y_mains) - argm = np.argmax(np.diff(y_mains)) - y_mid_new = y_mains[argm] - y_mid_next_new = y_mains[argm + 1] - - #print(y_mid_new,argm,'y_mid_new') - #print(y_mid_next_new,argm+1,'y_mid_next_new') - #print(y_mid[new_main_sep_y],new_main_sep_y,'yseps') - x_start=np.array(x_start) - x_end=np.array(x_end) - y_mid=np.array(y_mid) - # iff either boundary is itself not a full-width separator, - # then include it in the range of separators to be kept - if y_mid_new in y_mains0: - where = y_mid > y_mid_new - else: - where = y_mid >= y_mid_new - if y_mid_next_new in y_mains0: - where &= y_mid < y_mid_next_new - else: - where &= y_mid <= y_mid_next_new - x_start = x_start[where] - x_end = x_end[where] - y_mid = y_mid[where] + x_start = np.array(x_start, dtype=int) + x_end = np.array(x_end, dtype=int) + y_min = np.array(y_min, dtype=int) + y_mid = np.array(y_mid, dtype=int) + y_max = np.array(y_max, dtype=int) + #print(y_mid,'y_mid') #print(x_start,'x_start') #print(x_end,'x_end') - # remove redundant separators that span the same columns - # (keeping only 1 representative each) - deleted = set() - for index_i in range(len(x_start) - 1): - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) - #print(nodes_i, "nodes_i") - for index_j in range(index_i + 1, len(x_start)): - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - #print(nodes_j, "nodes_j") - if nodes_i == nodes_j: - deleted.add(index_j) - #print(deleted,"deleted") - remained_sep_indexes = set(range(len(x_start))) - deleted - #print(remained_sep_indexes,'remained_sep_indexes') + # remove redundant separators (with nothing in between) + args_emptysep = set() + args_ysorted = np.argsort(y_mid) + for i in range(len(y_mid)): + # find nearest 
neighbours above with nothing in between + prev = (~np.eye(len(y_mid), dtype=bool)[i] & + (y_mid[i] >= y_mid) & + # complete subsumption: + # (x_start[i] >= x_start) & + # (x_end[i] <= x_end) + # partial overlap + (x_start[i] < x_end) & + (x_end[i] > x_start) + ) + prev[list(args_emptysep)] = False # but no pair we already saw + if not prev.any(): + continue + prev = np.flatnonzero(prev[args_ysorted]) + j = args_ysorted[prev[-1]] + if not np.any(regions_without_separators[y_max[j]: y_min[i], + peak_points[min(x_start[i], x_start[j])]: + peak_points[max(x_end[i], x_end[j])]]): + args_emptysep.add(i) + if x_start[j] > x_start[i]: + # print(j, "now starts at", x_start[i]) + x_start[j] = x_start[i] + if x_end[j] < x_end[i]: + x_end[j] = x_end[i] + # print(j, "now ends at", x_end[i]) + # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty prev sep") + continue + # find nearest neighbours below with nothing in between + nExt = (~np.eye(len(y_mid), dtype=bool)[i] & + (y_mid[i] <= y_mid) & + (x_start[i] >= x_start) & + (x_end[i] <= x_end)) + nExt[list(args_emptysep)] = False # but no pair we already saw + if not nExt.any(): + continue + nExt = np.flatnonzero(nExt[args_ysorted]) + j = args_ysorted[nExt[0]] + if not np.any(regions_without_separators[y_max[i]: y_min[j], + peak_points[x_start[i]]: + peak_points[x_end[i]]]): + args_emptysep.add(i) + # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty next sep") + args_to_be_kept = [arg for arg in args_ysorted + if not arg in args_emptysep] + x_start = x_start[args_to_be_kept] + x_end = x_end[args_to_be_kept] + y_min = y_min[args_to_be_kept] + y_mid = y_mid[args_to_be_kept] + y_max = y_max[args_to_be_kept] - # determine which separators span which columns - mother = [] # whether the respective separator has a mother separator - child = [] # whether the respective separator has a child separator - for index_i in remained_sep_indexes: - have_mother=0 - 
have_child=0 - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) - for index_j in remained_sep_indexes: - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - if nodes_i < nodes_j: - have_mother=1 - if nodes_i > nodes_j: - have_child=1 - mother.append(have_mother) - child.append(have_child) - #print(mother, "mother") - #print(child, "child") - - mother = np.array(mother) - child = np.array(child) - #print(mother,'mother') - #print(child,'child') - remained_sep_indexes = np.array(list(remained_sep_indexes)) - #print(len(remained_sep_indexes)) - #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_mid),'lens') - - reading_order_type = 0 - if len(remained_sep_indexes): - #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)') - #print(np.array(mother),'mother') - remained_sep_indexes_without_mother = remained_sep_indexes[mother==0] - remained_sep_indexes_with_child_without_mother = remained_sep_indexes[(mother==0) & (child==1)] - #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') - #print(remained_sep_indexes_without_mother,'remained_sep_indexes_without_mother') - - x_end_with_child_without_mother = x_end[remained_sep_indexes_with_child_without_mother] - x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] - y_mid_with_child_without_mother = y_mid[remained_sep_indexes_with_child_without_mother] - - x_end_without_mother = x_end[remained_sep_indexes_without_mother] - x_start_without_mother = x_start[remained_sep_indexes_without_mother] - y_mid_without_mother = y_mid[remained_sep_indexes_without_mother] - - if len(remained_sep_indexes_without_mother)>=2: - for i in range(len(remained_sep_indexes_without_mother)-1): - index_i = remained_sep_indexes_without_mother[i] - nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) - #print(index_i, nodes_i, "nodes_i without mother") - for j in range(i + 1, len(remained_sep_indexes_without_mother)): - index_j = 
remained_sep_indexes_without_mother[j] - nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) - #print(index_j, nodes_j, "nodes_j without mother") - if nodes_i - nodes_j != nodes_i: - #print("type=1") - reading_order_type = 1 - else: - y_mid_without_mother = np.zeros(0, int) - x_start_without_mother = np.zeros(0, int) - x_end_without_mother = np.zeros(0, int) - y_mid_with_child_without_mother = np.zeros(0, int) - x_start_with_child_without_mother = np.zeros(0, int) - x_end_with_child_without_mother = np.zeros(0, int) - - #print(reading_order_type,'reading_order_type') - #print(y_mid_with_child_without_mother,'y_mid_with_child_without_mother') - #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') - #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') - - len_sep_with_child = len(child[child==1]) - #print(len_sep_with_child,'len_sep_with_child') - there_is_sep_with_child = 0 - if len_sep_with_child >= 1: - there_is_sep_with_child = 1 - - return (reading_order_type, - x_start_returned, - x_end_returned, - y_mid_returned, - y_max_returned, - y_mid_without_mother, - x_start_without_mother, - x_end_without_mother, - there_is_sep_with_child, - y_mid_with_child_without_mother, - x_start_with_child_without_mother, - x_end_with_child_without_mother, - new_main_sep_y) + return (x_start, + x_end, + y_min, + y_mid, + y_max) def box2rect(box: Tuple[int, int, int, int]) -> Tuple[int, int, int, int]: return (box[1], box[1] + box[3], @@ -1212,6 +1055,25 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) return textlines_con_changed def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): + """ + Order text region contours within a single column bbox in a top-down-left-right way. + + First, determine the vertical gaps. Then iterate over each vertical segment, + identifying the contours centered in that segment. 
Order them by their + horizontal center, and add them to the overall order. + + Arguments: + * textline_mask: the mask of the textline segmentation, cropped for that box + * contours_main: the paragraph text region contours expected to be here + * contours_head: the heading text region contours expected to be here + * y_ref: the vertical offset of that box within the page + * x_ref: the horizontal offset of that box within the page + + Returns: a tuple of + * the array of contour indexes overall within this box (i.e. into main+head) + * the array of types (1 for paragraph, 2 for heading) + * the array of contour indexes for the respective type (i.e. into contours_main or contours_head) + """ ##plt.imshow(textline_mask) ##plt.show() y = textline_mask.sum(axis=1) # horizontal projection profile @@ -1547,7 +1409,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, try: num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot], num_col_classifier, tables, multiplier=7.0) - #print("big part %d:%d has %d columns" % (top, bot, num_col), peaks_neg_fin) + # print("big part %d:%d has %d columns" % (top, bot, num_col + 1), peaks_neg_fin) except: num_col = 0 peaks_neg_fin = [] @@ -1564,11 +1426,36 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, return num_col_fin, peaks_neg_fin_fin, matrix_of_seps_ch, splitter_y_new, separators_closeup_n def return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, - matrix_of_lines_ch, + splitter_y_new, + regions_without_separators, + regions_with_separators, + matrix_of_seps_ch, num_col_classifier, erosion_hurts, tables, right2left_readingorder, logger=None): + """ + Iterate through the vertical parts of a page, each with its own set of columns, + and from the matrix of horizontal separators for that part, find an ordered + list of bounding boxes through all columns and regions. 
+ + Arguments: + * splitter_y_new: the y coordinates separating the parts + * regions_without_separators: (text) region mask with separators suppressed; + (needed to find per-part columns and to combine separators if possible) + * regions_with_separators: (full) region map with separators suppressed; + (needed to elongate separators if possible) + * matrix_of_seps: type and coordinates of horizontal and vertical separators, + as well as headings + * num_col_classifier: predicted number of columns for the entire page + * erosion_hurts: bool + * tables: bool + * right2left_readingorder: whether to invert the default left-to-right order + + Returns: a tuple of + * the ordered list of bounding boxes + * a list of arrays: the x coordinates delimiting the columns for every page part + (according to splitter) + """ if right2left_readingorder: regions_without_separators = cv2.flip(regions_without_separators,1) @@ -1576,12 +1463,20 @@ def return_boxes_of_images_by_order_of_reading_new( logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') + # def dbg_imshow(box, title): + # xmin, xmax, ymin, ymax = box + # plt.imshow(regions_with_separators) #, extent=[0, width_tot, bot, top]) + # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax)) + # plt.show() # def dbg_plt(box=None, title=None, rectangles=None, rectangles_showidx=False): # minx, maxx, miny, maxy = box or (0, None, 0, None) # img = regions_without_separators[miny:maxy, minx:maxx] # plt.imshow(img) - # xrange = np.arange(0, img.shape[1], 100) - # yrange = np.arange(0, img.shape[0], 100) + # step = max(img.shape) // 10 + # xrange = np.arange(0, img.shape[1], step) + # yrange = np.arange(0, img.shape[0], step) # ax = plt.gca() # ax.set_xticks(xrange) # ax.set_yticks(yrange) @@ -1597,7 +1492,7 @@ def 
return_boxes_of_images_by_order_of_reading_new( # ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, # fill=False, linewidth=1, edgecolor='r')) # if rectangles_showidx: - # ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i + 1), c='r') + # ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i), c='r') # plt.show() # dbg_plt(title="return_boxes_of_images_by_order_of_reading_new") @@ -1606,11 +1501,12 @@ def return_boxes_of_images_by_order_of_reading_new( splitter_y_new = np.array(splitter_y_new, dtype=int) height_tot, width_tot = regions_without_separators.shape big_part = 22 * height_tot // 100 # percent height + _, ccomps, cstats, _ = cv2.connectedComponentsWithStats(regions_without_separators.astype(np.uint8)) for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) - matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) & - (matrix_of_lines_ch[:,7] < bot)] + matrix_new = matrix_of_seps_ch[(matrix_of_seps_ch[:,6] >= top) & + (matrix_of_seps_ch[:,7] < bot)] #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') # check to see is there any vertical separator to find holes. 
@@ -1698,19 +1594,9 @@ def return_boxes_of_images_by_order_of_reading_new( # elongate horizontal separators+headings as much as possible without overlap args_nonver = matrix_new[:, 9] != 1 - regions_with_separators = np.copy(regions_without_separators[top:bot]) - for xmin, xmax, ymin, ymax in matrix_new[:, [2, 3, 6, 7]]: - regions_with_separators[ymin - top: ymax - top, xmin: xmax] = 6 - # def dbg_imshow(box, title): - # xmin, xmax, ymin, ymax = box - # plt.imshow(regions_with_separators, extent=[0, width_tot, bot, top]) - # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, - # fill=False, linewidth=1, edgecolor='r')) - # plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax)) - # plt.show() for i in np.flatnonzero(args_nonver): xmin, xmax, ymin, ymax, typ = matrix_new[i, [2, 3, 6, 7, 9]] - cut = regions_with_separators[ymin - top: ymax - top] + cut = regions_with_separators[ymin: ymax] # dbg_imshow([xmin, xmax, ymin, ymax], "separator %d (%s)" % (i, "heading" if typ else "horizontal")) starting = xmin - peaks_neg_tot min_start = np.flatnonzero(starting >= 0)[-1] # last left-of @@ -1737,6 +1623,7 @@ def return_boxes_of_images_by_order_of_reading_new( args_hor = matrix_new[:, 9] == 0 x_min_hor_some = matrix_new[:, 2][args_hor] x_max_hor_some = matrix_new[:, 3][args_hor] + y_min_hor_some = matrix_new[:, 6][args_hor] y_max_hor_some = matrix_new[:, 7][args_hor] cy_hor_some = matrix_new[:, 5][args_hor] @@ -1752,412 +1639,144 @@ def return_boxes_of_images_by_order_of_reading_new( # (x +/- 30px to avoid crossing col peaks by accident) x_min_hor_some = np.append(x_min_hor_some, np.tile(x_min_hor_head + 30, 2)) x_max_hor_some = np.append(x_max_hor_some, np.tile(x_max_hor_head - 30, 2)) + y_min_hor_some = np.append(y_min_hor_some, # toplines + np.concatenate((y_min_hor_head - 2, + y_max_hor_head - 0))) y_max_hor_some = np.append(y_max_hor_some, # baselines - np.concatenate((y_min_hor_head + 2, + np.concatenate((y_min_hor_head + 0, 
y_max_hor_head + 2))) - cy_hor_some = np.append(cy_hor_some, # toplines - np.concatenate((y_min_hor_head - 2, - y_max_hor_head - 2))) + cy_hor_some = np.append(cy_hor_some, # centerlines + np.concatenate((y_min_hor_head - 1, + y_max_hor_head + 1))) + + # analyse connected components of regions to gain additional separators + # and prepare a map for cross-column boxes + ccounts = np.bincount(ccomps[top: bot].flatten()) + col_ccounts = np.stack([np.bincount(ccomps[top: bot, left: right].flatten(), + minlength=ccounts.size) + for left, right in pairwise(peaks_neg_tot)]) + labelcolmap = dict() + for label, label_count in enumerate(ccounts): + if not label: + continue + label_left, label_top, label_width, label_height, label_area = cstats[label] + # if label_count < 0.9 * label_area: + # # mostly not in this part of the page + # continue + if label_count < 0.01 * (top - bot) * width_tot: + continue + #assert np.sum(col_ccounts[:, label]) == label_count + label_right = label_left + label_width + label_bot = label_top + label_height + label_start = np.flatnonzero(peaks_neg_tot > label_left)[0] - 1 + label_end = np.flatnonzero(peaks_neg_tot >= label_right)[0] + # store as dict for multi-column boxes: + for start in range(label_start, label_end): + labelcolmap.setdefault(start, list()).append( + (label_end, label_top, label_bot, sum(col_ccounts[start: label_end, label]))) + # make additional separators: + if label_end - label_start < 2: + continue + if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2: + continue + x_min_hor_some = np.append(x_min_hor_some, [label_left] * 2) + x_max_hor_some = np.append(x_max_hor_some, [label_right] * 2) + y_min_hor_some = np.append(y_min_hor_some, [label_top - 2, label_bot]) + y_max_hor_some = np.append(y_max_hor_some, [label_top, label_bot + 2]) + cy_hor_some = np.append(cy_hor_some, [label_top - 1, label_bot + 1]) if right2left_readingorder: x_max_hor_some = width_tot - x_min_hor_some x_min_hor_some = width_tot - 
x_max_hor_some - - reading_order_type, x_starting, x_ending, y_mid, y_max, \ - y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ - there_is_sep_with_child, \ - y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - peaks_neg_tot, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some) - - # show multi-column separators - # dbg_plt([0, None, top, bot], "multi-column separators in current split", + x_starting, x_ending, y_min, y_mid, y_max = return_multicol_separators_x_start_end( + regions_without_separators, peaks_neg_tot, top, bot, + x_min_hor_some, x_max_hor_some, cy_hor_some, y_min_hor_some, y_max_hor_some) + # dbg_plt([0, None, top, bot], "non-empty multi-column separators in current split", # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], - # y_mid - top, y_max - top)), True) + # y_min - top, y_max - top)), True) - if (reading_order_type == 1 or - len(y_mid_without_mother) >= 2 or - there_is_sep_with_child == 1): - # there are top-level multi-colspan horizontal separators which overlap each other - # or multiple top-level multi-colspan horizontal separators - # or multi-colspan horizontal separators shorter than their respective top-level: - # todo: explain how this is dealt with - try: - y_grenze = top + 300 - up = (y_mid > top) & (y_mid <= y_grenze) - - args_early_ys=np.arange(len(y_mid)) - #print(args_early_ys,'args_early_ys') - #print(y_mid,'y_mid') - - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up = args_early_ys[up] - #print(args_up,'args_up') - #print(y_mid_up,'y_mid_up') - #check if there is a big separator in this y_mains0 - if len(y_mid_up) > 0: - # is there a separator with full-width span? 
- main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) - y_mid_main_separator_up = y_mid_up[main_separator] - y_max_main_separator_up = y_max_up[main_separator] - args_main_to_deleted = args_up[main_separator] - #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_max_main_separator_up): - args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) - #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[-1], - top, y_max_main_separator_up.max()]) - # dbg_plt(boxes[-1], "near top main separator box") - top = y_max_main_separator_up.max() - - #print(top,'top') - y_mid = y_mid[args_to_be_kept] - x_starting = x_starting[args_to_be_kept] - x_ending = x_ending[args_to_be_kept] - y_max = y_max[args_to_be_kept] - - #print('galdiha') - y_grenze = top + 200 - up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys2 = np.arange(len(y_mid)) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up2 = args_early_ys2[up] - #print(y_mid_up,x_starting_up,x_ending_up,'didid') - else: - args_early_ys2 = args_early_ys - args_up2 = args_up - - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') - - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) - - if len(args_to_be_kept2): - #print(args_to_be_kept2, "args_to_be_kept2") - y_mid = y_mid[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_max = y_max[args_to_be_kept2] - - #int(top) - # order multi-column separators - y_mid_by_order=[] - 
x_start_by_order=[] - x_end_by_order=[] - if (reading_order_type == 1 or - len(x_end_with_child_without_mother) == 0): - if reading_order_type == 1: - # there are top-level multi-colspan horizontal separators which overlap each other - #print("adding all columns at top because of multiple overlapping mothers") - y_mid_by_order.append(top) - x_start_by_order.append(0) - x_end_by_order.append(len(peaks_neg_tot)-2) - else: - # there are no top-level multi-colspan horizontal separators which themselves - # contain shorter multi-colspan separators - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - ind_args=np.arange(len(y_mid)) - #print(ind_args,'ind_args') - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) 
- y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - else: - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - columns_covered_by_mothers_with_child = set() - for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_mothers_with_child.update( - range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) - #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") - columns_not_covered_by_mothers_with_child = list( - all_columns - columns_covered_by_mothers_with_child) - #indexes_to_be_spanned=[] - for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) - #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") - 
ind_args = np.arange(len(y_mid)) - for i_s_nc in columns_not_covered_by_mothers_with_child: - if i_s_nc in x_start_with_child_without_mother: - # use only seps with mother's span ("biggest") - #print("i_s_nc", i_s_nc) - x_end_biggest_column = \ - x_end_with_child_without_mother[ - x_start_with_child_without_mother == i_s_nc][0] - args_all_biggest_seps = \ - ind_args[(x_starting == i_s_nc) & - (x_ending == x_end_biggest_column)] - y_mid_column_nc = y_mid[args_all_biggest_seps] - #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") - #x_start_column_nc = x_starting[args_all_biggest_seps] - #x_end_column_nc = x_ending[args_all_biggest_seps] - y_mid_column_nc = np.sort(y_mid_column_nc) - #print(y_mid_column_nc, "y_mid_column_nc (sorted)") - for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): - #print("i_c", i_c) - #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") - ind_all_seps_between_nm_wc = \ - ind_args[(y_mid > nc_top) & - (y_mid < nc_bot) & - (x_starting >= i_s_nc) & - (x_ending <= x_end_biggest_column)] - y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] - - columns_covered_by_mothers = set() - for dj in range(len(ind_all_seps_between_nm_wc)): - columns_covered_by_mothers.update( - range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - child_columns = set(range(i_s_nc, x_end_biggest_column)) - columns_not_covered = list(child_columns - columns_covered_by_mothers) - #print(child_columns, "child_columns") - #print(columns_not_covered, "columns_not_covered") - - if len(ind_all_seps_between_nm_wc): - biggest = np.argmax(x_ending_all_between_nm_wc - - x_starting_all_between_nm_wc) - #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") - #print(biggest, "%d:%d" % 
(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest]), "biggest") - if columns_covered_by_mothers == set( - range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])): - # single biggest accounts for all covered columns alone, - # this separator should be extended to cover all - seps_too_close_to_top_separator = \ - ((y_mid_all_between_nm_wc > nc_top) & - (y_mid_all_between_nm_wc <= nc_top + 500)) - if (np.count_nonzero(seps_too_close_to_top_separator) and - np.count_nonzero(seps_too_close_to_top_separator) < - len(ind_all_seps_between_nm_wc)): - #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") - y_mid_all_between_nm_wc = \ - y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] - x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] - x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] - - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_end_biggest_column) - else: - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - - if len(columns_not_covered): - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) - - ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(int(i_s_nc), 
int(x_end_biggest_column)): - ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] - x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] - x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - else: - #print(i_s_nc,'column not covered by mothers with child') - ind_args_in_col=ind_args[x_starting==i_s_nc] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(int(x_start_itself), int(x_end_itself)+1): - #print(column,'cols') - #print('burda') - #print('burda2') - y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') - #print(y_mid_itself,'y_mid_itself') + # core algorithm: + # 1. 
iterate through multi-column separators, pre-ordered by their y coord + # 2. for each separator, iterate from its starting to its ending column + # 3. in each starting column, determine the next downwards separator, + # 4. if there is none, then fill up the column to the bottom; + # otherwise, fill up to that next separator + # 5. moreover, determine the next rightward column that would not cut through + # any regions, advancing to that column, and storing a new in-order bbox + # for that down/right span + # 6. if there was a next separator, and it ends no further than the current one, + # then recurse on that separator from step 1, then continue (with the next + # column for the current separator) at step 2, or (with the next separator + # in order) at step 1 + args = list(range(len(y_mid))) + while len(args): + cur = args[0] + args = args[1:] + # print("iter", cur, y_mid[cur], "%d:%d" % (x_starting[cur], x_ending[cur])) + def get_span(start, y_top, y_bot): + # for last, l_top, l_bot, l_count in labelcolmap.get(start, []): + # if y_top < l_bot and y_bot > l_top and last > start + 1: + # width = (peaks_neg_tot[last] - peaks_neg_tot[start]) + # print("span", start, last, l_top, l_bot, l_count, + # "box area", (y_bot - y_top) * width, + # "label area", (min(y_bot, l_bot) - max(y_top, l_top)) * width, + # "box height", (y_bot - y_top), + # "label height", sum(regions_without_separators[ + # y_top: y_bot, peaks_neg_tot[start + 1]])) + return min((last for last, l_top, l_bot, l_count in labelcolmap.get(start, []) + # yield the right-most column that does not cut through + # any regions in this horizontal span + if y_top < l_bot and y_bot > l_top + # Ignore if it ends here, anyway + and last > start + 1 + # Ensure this is not just a tiny region near larger regions + and l_count > 0.1 * max(l_count2 for _, l_top2, l_bot2, l_count2 in labelcolmap[start] + if y_top < l_bot2 and y_bot > l_top2) + # or just a small cut of the respective region + # (i.e. 
box should cover at least 10% of the label). + and ((min(y_bot, l_bot) - max(y_top, l_top)) * + (peaks_neg_tot[last] - peaks_neg_tot[start])) > 0.1 * l_count + # But do allow cutting tiny passages with less 10% of height + # (i.e. label is already almost separated by columns) + and sum(regions_without_separators[ + y_top: y_bot, peaks_neg_tot[start + 1]]) > 0.1 * (y_bot - y_top)), + # Otherwise advance only 1 column. + default=start + 1) + def add_sep(cur): + column = x_starting[cur] + while column < x_ending[cur]: + nxt = np.flatnonzero((y_mid[cur] < y_mid) & + (column >= x_starting) & + (column < x_ending)) + if len(nxt): + nxt = nxt[0] + # print("column", column) + last = get_span(column, y_max[cur], y_min[nxt]) + last = min(last, x_ending[nxt], x_ending[cur]) + # print("nxt", nxt, y_mid[nxt], "%d:%d" % (column, last)) boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) - except: - logger.exception("cannot assign boxes") - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, bot]) - # dbg_plt(boxes[-1], "fallback box") - else: - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if len(x_starting)>0: - columns_covered_by_seps_covered_more_than_2col = set() - for dj in range(len(x_starting)): - if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_seps_covered_more_than_2col.update( - range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, 
np.array(columns_not_covered, x_ending.dtype) + 1) - if len(new_main_sep_y) > 0: - x_starting = np.append(x_starting, 0) - x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) - else: - x_starting = np.append(x_starting, x_starting[0]) - x_ending = np.append(x_ending, x_ending[0]) - else: - columns_not_covered = list(all_columns) - y_mid = np.append(y_mid, np.ones(len(columns_not_covered), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - - ind_args = np.arange(len(y_mid)) - - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - #print(y_mid_itself,'y_mid_itself') - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(x_start_itself, x_end_itself+1): - #print(column,'cols') - #print('burda2') - y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - #print(y_mid_next,'y_mid_next') - y_mid_next = y_mid_next.min(initial=bot) - 
#print(y_mid_next,'y_mid_next') - boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) + peaks_neg_tot[last], + y_mid[cur], + y_mid[nxt]]) + # dbg_plt(boxes[-1], "recursive column %d:%d box [%d]" % (column, last, len(boxes))) + column = last + if last == x_ending[nxt] and x_ending[nxt] <= x_ending[cur] and nxt in args: + # child – recur + # print("recur", nxt, y_mid[nxt], "%d:%d" % (x_starting[nxt], x_ending[nxt])) + args.remove(nxt) + add_sep(nxt) + else: + # print("column", column) + last = get_span(column, y_max[cur], bot) + # print("bot", bot, "%d:%d" % (column, last)) + boxes.append([peaks_neg_tot[column], + peaks_neg_tot[last], + y_mid[cur], + bot]) + # dbg_plt(boxes[-1], "non-recursive column %d box [%d]" % (column, len(boxes))) + column = last + add_sep(cur) if right2left_readingorder: peaks_neg_tot_tables_new = [] From 4475183f08d2c25eb90deb04bda552930abd4ba0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 14 Nov 2025 03:39:36 +0100 Subject: [PATCH 025/118] improve rules governing column split - reduce `sigma` for smoothing of input to `find_peaks` (so we get deeper gaps between columns) - allow column boundaries closer to the margins (50 instead of 100 or 200 px, 170 instead of 370 px) - allow column boundaries closer to each other (300 instead of 400 px) - add a secondary `grenze` criterion for depth of gap (relative to lowest minimum, if that is smaller than the old criterion relative to lowest maximum) - for calls to `find_num_col` within parts of a page, do allow unbalanced column boundaries --- src/eynollah/utils/__init__.py | 113 +++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 47 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index e00004f..570eefe 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -241,7 +241,7 @@ def 
find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8): z = gaussian_filter1d(regions_without_separators_0, sigma_) return np.std(z) -def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8): +def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8, unbalanced=False): if not regions_without_separators.any(): return 0, [] regions_without_separators_0 = regions_without_separators.sum(axis=0) @@ -249,13 +249,15 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # ax1.imshow(regions_without_separators, aspect="auto") # ax2.plot(regions_without_separators_0) # plt.show() - sigma_ = 35 # 70#35 + sigma_ = 25 # 70#35 meda_n_updown = regions_without_separators_0[::-1] first_nonzero = next((i for i, x in enumerate(regions_without_separators_0) if x), 0) last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) last_nonzero = len(regions_without_separators_0) - last_nonzero - last_nonzero = last_nonzero - 100 - first_nonzero = first_nonzero + 200 + last_nonzero = last_nonzero - 50 #- 100 + first_nonzero = first_nonzero + 50 #+ 200 + last_offmargin = len(regions_without_separators_0) - 170 #370 + first_offmargin = 170 #370 y = regions_without_separators_0 # [first_nonzero:last_nonzero] y_help = np.zeros(len(y) + 20) y_help[10 : len(y) + 10] = y @@ -285,26 +287,34 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # ax2.axvline(last_nonzero, label="last nonzero") # ax2.text(first_nonzero, 0, "first nonzero", rotation=90) # ax2.text(last_nonzero, 0, "last nonzero", rotation=90) - # ax2.axvline(370, label="first") - # ax2.axvline(len(y) - 370, label="last") - # ax2.text(370, 0, "first", rotation=90) - # ax2.text(len(y) - 370, 0, "last", rotation=90) + # ax2.axvline(first_offmargin, label="first offmargin") + # ax2.axvline(last_offmargin, label="last offmargin") + # ax2.text(first_offmargin, 0, "first offmargin", 
rotation=90) + # ax2.text(last_offmargin, 0, "last offmargin", rotation=90) # plt.show() peaks_neg = peaks_neg - 10 - 10 + # print("raw peaks", peaks) peaks = peaks[(peaks > 0.06 * len(y)) & (peaks < 0.94 * len(y))] + # print("non-marginal peaks", peaks) interest_pos = z[peaks] + # print("interest_pos", interest_pos) interest_pos = interest_pos[interest_pos > 10] if not interest_pos.any(): return 0, [] + # plt.plot(z) # plt.show() + #print("raw peaks_neg", peaks_neg) peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & (peaks_neg < last_nonzero)] - peaks_neg = peaks_neg[(peaks_neg > 370) & - (peaks_neg < len(y) - 370)] + #print("non-zero peaks_neg", peaks_neg) + peaks_neg = peaks_neg[(peaks_neg > first_offmargin) & + (peaks_neg < last_offmargin)] + #print("non-marginal peaks_neg", peaks_neg) interest_neg = z[peaks_neg] + #print("interest_neg", interest_neg) if not interest_neg.any(): return 0, [] @@ -317,10 +327,14 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl min_peaks_neg = 0 # np.min(interest_neg) + # cutoff criterion: fixed fraction of lowest column height dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier grenze = min_peaks_pos - dis_talaei #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0 + # extra criterion: fixed multiple of lowest gap height + grenze = min(grenze, multiplier * (5 + np.min(interest_neg))) + # print(interest_neg,'interest_neg') # print(grenze,'grenze') # print(min_peaks_pos,'min_peaks_pos') @@ -356,18 +370,20 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') # cancel if resulting split is highly unbalanced across available width - if ((num_col == 3 and - ((peaks_neg_fin[0] > 0.75 * len(y) and - peaks_neg_fin[1] > 0.75 * len(y)) or - (peaks_neg_fin[0] < 0.25 * len(y) and - peaks_neg_fin[1] < 0.25 * len(y)) or - (peaks_neg_fin[0] < 0.5 * len(y) - 200 and - peaks_neg_fin[1] < 
0.5 * len(y)) or - (peaks_neg_fin[0] > 0.5 * len(y) + 200 and - peaks_neg_fin[1] > 0.5 * len(y)))) or - (num_col == 2 and - (peaks_neg_fin[0] > 0.75 * len(y) or - peaks_neg_fin[0] < 0.25 * len(y)))): + if unbalanced: + pass + elif ((num_col == 3 and + ((peaks_neg_fin[0] > 0.75 * len(y) and + peaks_neg_fin[1] > 0.75 * len(y)) or + (peaks_neg_fin[0] < 0.25 * len(y) and + peaks_neg_fin[1] < 0.25 * len(y)) or + (peaks_neg_fin[0] < 0.5 * len(y) - 200 and + peaks_neg_fin[1] < 0.5 * len(y)) or + (peaks_neg_fin[0] > 0.5 * len(y) + 200 and + peaks_neg_fin[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_fin[0] > 0.75 * len(y) or + peaks_neg_fin[0] < 0.25 * len(y)))): num_col = 1 peaks_neg_fin = [] @@ -376,7 +392,7 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # filter out peaks that are too close (<400px) to each other: # among each group, pick the position with smallest amount of text diff_peaks = np.abs(np.diff(peaks_neg_fin)) - cut_off = 400 + cut_off = 300 #400 peaks_neg_true = [] forest = [] # print(len(peaks_neg_fin),'len_') @@ -401,30 +417,32 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl #print(peaks_neg_true, "peaks_neg_true") ##print(num_col,'early') # cancel if resulting split is highly unbalanced across available width - if ((num_col == 3 and - ((peaks_neg_true[0] > 0.75 * len(y) and - peaks_neg_true[1] > 0.75 * len(y)) or - (peaks_neg_true[0] < 0.25 * len(y) and - peaks_neg_true[1] < 0.25 * len(y)) or - (peaks_neg_true[0] < 0.5 * len(y) - 200 and - peaks_neg_true[1] < 0.5 * len(y)) or - (peaks_neg_true[0] > 0.5 * len(y) + 200 and - peaks_neg_true[1] > 0.5 * len(y)))) or - (num_col == 2 and - (peaks_neg_true[0] > 0.75 * len(y) or - peaks_neg_true[0] < 0.25 * len(y)))): + if unbalanced: + pass + elif ((num_col == 3 and + ((peaks_neg_true[0] > 0.75 * len(y) and + peaks_neg_true[1] > 0.75 * len(y)) or + (peaks_neg_true[0] < 0.25 * len(y) and + peaks_neg_true[1] < 0.25 * len(y)) or + 
(peaks_neg_true[0] < 0.5 * len(y) - 200 and + peaks_neg_true[1] < 0.5 * len(y)) or + (peaks_neg_true[0] > 0.5 * len(y) + 200 and + peaks_neg_true[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_true[0] > 0.75 * len(y) or + peaks_neg_true[0] < 0.25 * len(y)))): num_col = 1 peaks_neg_true = [] - if (num_col == 3 and - (peaks_neg_true[0] < 0.75 * len(y) and - peaks_neg_true[0] > 0.25 * len(y) and - peaks_neg_true[1] > 0.80 * len(y))): + elif (num_col == 3 and + (peaks_neg_true[0] < 0.75 * len(y) and + peaks_neg_true[0] > 0.25 * len(y) and + peaks_neg_true[1] > 0.80 * len(y))): num_col = 2 peaks_neg_true = [peaks_neg_true[0]] - if (num_col == 3 and - (peaks_neg_true[1] < 0.75 * len(y) and - peaks_neg_true[1] > 0.25 * len(y) and - peaks_neg_true[0] < 0.20 * len(y))): + elif (num_col == 3 and + (peaks_neg_true[1] < 0.75 * len(y) and + peaks_neg_true[1] > 0.25 * len(y) and + peaks_neg_true[0] < 0.20 * len(y))): num_col = 2 peaks_neg_true = [peaks_neg_true[1]] @@ -1151,8 +1169,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] - # assert len(final_indexers_sorted) == len(contours_main) + len(contours_head) - # assert not len(final_indexers_sorted) or max(final_index_type) == max(len(contours_main) + assert len(set(final_indexers_sorted)) == len(contours_main) + len(contours_head) + assert set(final_index_type) == set(range(len(contours_main))).union(range(len(contours_head))) return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type) @@ -1518,7 +1536,8 @@ def return_boxes_of_images_by_order_of_reading_new( regions_without_separators[top:bot], # we do not expect to get all columns in small parts (headings etc.): num_col_classifier if bot - top >= big_part else 1, - tables, multiplier=6. if erosion_hurts else 7.) + tables, multiplier=6. 
if erosion_hurts else 7., + unbalanced=True) except: peaks_neg_fin=[] num_col = 0 @@ -1534,7 +1553,7 @@ def return_boxes_of_images_by_order_of_reading_new( if len(peaks_neg_fin)==0: num_col, peaks_neg_fin = find_num_col( regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=3.) + num_col_classifier, tables, multiplier=3., unbalanced=True) #print(peaks_neg_fin,'peaks_neg_fin') peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] From 3c15c4f7d4bf03fee11c54da82ba7d29f09ada5a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 14:29:41 +0100 Subject: [PATCH 026/118] back to `rotate_image` instead of `rotation_image_new` for deskewing (because the latter does not preserve coordinates; it scales, even when resizing the image; this caused coordinate problems when matching deskewed contours) --- src/eynollah/eynollah.py | 58 +++++++++------------------------------- 1 file changed, 13 insertions(+), 45 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 2bdb2c7..efd67d5 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -88,12 +88,7 @@ from .utils.contour import ( join_polygons, make_intersection, ) -from .utils.rotate import ( - rotate_image, - rotation_not_90_func, - rotation_not_90_func_full_layout, - rotation_image_new -) +from .utils.rotate import rotate_image from .utils.utils_ocr import ( return_start_and_end_of_common_text_of_textline_ocr_without_common_section, return_textline_contour_with_added_box_coordinate, @@ -3131,11 +3126,9 @@ class Eynollah: self.logger.debug('enter run_boxes_no_full_layout') t_0_box = time.time() if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = rotation_not_90_func( - image_page, textline_mask_tot, text_regions_p, table_prediction, slope_deskew) - text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = 
resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 if self.tables: regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 @@ -3276,20 +3269,9 @@ class Eynollah: text_regions_p[:,:][table_prediction[:,:]==1] = 10 img_revised_tab = text_regions_p[:,:] if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ - rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, - table_prediction, slope_deskew) - - text_regions_p_d = resize_image(text_regions_p_d, - text_regions_p.shape[0], - text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, - text_regions_p.shape[0], - text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) - + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: @@ -3303,20 +3285,9 @@ class Eynollah: else: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, table_prediction_n = \ - rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, - table_prediction, slope_deskew) - - text_regions_p_d = resize_image(text_regions_p_d, - text_regions_p.shape[0], - text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, - text_regions_p.shape[0], - 
text_regions_p.shape[1]) - table_prediction_n = resize_image(table_prediction_n, - text_regions_p.shape[0], - text_regions_p.shape[1]) - + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + table_prediction_n = rotate_image(table_prediction, slope_deskew) regions_without_separators_d = (text_regions_p_d[:,:] == 1)*1 regions_without_separators_d[table_prediction_n[:,:] == 1] = 1 else: @@ -3465,12 +3436,9 @@ class Eynollah: #plt.show() ####if not self.tables: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, textline_mask_tot_d, text_regions_p_d, regions_fully_n = rotation_not_90_func_full_layout( - image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) - - text_regions_p_d = resize_image(text_regions_p_d, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = rotate_image(textline_mask_tot, slope_deskew) + text_regions_p_d = rotate_image(text_regions_p, slope_deskew) + regions_fully_n = rotate_image(regions_fully, slope_deskew) if not self.tables: regions_without_separators_d = (text_regions_p_d[:, :] == 1) * 1 else: From 5a778003fde3cc540f3b8b1c00bc6eebee1f9295 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 14:32:22 +0100 Subject: [PATCH 027/118] contour matching for deskewed image: ensure matches for both sides --- src/eynollah/eynollah.py | 42 +++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index efd67d5..b7c6ddf 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4461,42 +4461,42 @@ class Eynollah: dists[i] = np.linalg.norm(centers[:, i:i + 1] - centers_d, axis=0) corresp = 
np.zeros(dists.shape, dtype=bool) # keep searching next-closest until at least one correspondence on each side - while not np.all(corresp.sum(axis=1)) and not np.all(corresp.sum(axis=0)): + while not np.all(corresp.sum(axis=1)) or not np.all(corresp.sum(axis=0)): idx = np.nanargmin(dists) i, j = np.unravel_index(idx, dists.shape) dists[i, j] = np.nan corresp[i, j] = True - #print("original/deskewed adjacency", corresp.nonzero()) + # print("original/deskewed adjacency", corresp.nonzero()) contours_only_text_parent_d_ordered = np.zeros_like(contours_only_text_parent) contours_only_text_parent_d_ordered = contours_only_text_parent_d[np.argmax(corresp, axis=1)] # img1 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # for i in range(len(contours_only_text_parent)): # cv2.fillPoly(img1, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) - # plt.subplot(2, 2, 1, title="direct corresp contours") + # plt.subplot(1, 4, 1, title="direct corresp contours") # plt.imshow(img1) # img2 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # join deskewed regions mapping to single original ones for i in range(len(contours_only_text_parent)): if np.count_nonzero(corresp[i]) > 1: indices = np.flatnonzero(corresp[i]) - #print("joining", indices) + # print("joining", indices) polygons_d = [contour2polygon(contour) for contour in contours_only_text_parent_d[indices]] contour_d = polygon2contour(join_polygons(polygons_d)) contours_only_text_parent_d_ordered[i] = contour_d # cv2.fillPoly(img2, pts=[contour_d], color=i + 1) - # plt.subplot(2, 2, 3, title="joined contours") + # plt.subplot(1, 4, 2, title="joined contours") # plt.imshow(img2) # img3 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # split deskewed regions mapping to multiple original ones def deskew(polygon): polygon = shapely.affinity.rotate(polygon, -slope_deskew, origin=center) - polygon = shapely.affinity.translate(polygon, *offset.squeeze()) + #polygon = shapely.affinity.translate(polygon, *offset.squeeze()) 
return polygon for j in range(len(contours_only_text_parent_d)): if np.count_nonzero(corresp[:, j]) > 1: indices = np.flatnonzero(corresp[:, j]) - #print("splitting along", indices) + # print("splitting along", indices) polygons = [deskew(contour2polygon(contour)) for contour in contours_only_text_parent[indices]] polygon_d = contour2polygon(contours_only_text_parent_d[j]) @@ -4509,14 +4509,38 @@ class Eynollah: if polygon_d] contours_only_text_parent_d_ordered[indices] = contours_d # cv2.fillPoly(img3, pts=contours_d, color=j + 1) - # plt.subplot(2, 2, 4, title="split contours") + # plt.subplot(1, 4, 3, title="split contours") # plt.imshow(img3) # img4 = np.zeros(text_only_d.shape[:2], dtype=np.uint8) # for i in range(len(contours_only_text_parent)): # cv2.fillPoly(img4, pts=[contours_only_text_parent_d_ordered[i]], color=i + 1) - # plt.subplot(2, 2, 2, title="result contours") + # plt.subplot(1, 4, 4, title="result contours") # plt.imshow(img4) # plt.show() + # from matplotlib import patches as ptchs + # plt.subplot(1, 2, 1, title="undeskewed") + # plt.imshow(text_only) + # centers = np.stack(find_center_of_contours(contours_only_text_parent)) # [2, N] + # for i in range(len(contours_only_text_parent)): + # cnt = contours_only_text_parent[i] + # ctr = centers[:, i] + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='blue')) + # plt.gca().scatter(ctr[0], ctr[1], 20, c='blue', marker='x') + # plt.gca().text(ctr[0], ctr[1], str(i), c='blue') + # plt.subplot(1, 2, 2, title="deskewed") + # plt.imshow(text_only_d) + # centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d_ordered)) # [2, N] + # for i in range(len(contours_only_text_parent)): + # cnt = contours_only_text_parent[i] + # cnt = polygon2contour(deskew(contour2polygon(cnt))) + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='blue')) + # for i in range(len(contours_only_text_parent_d_ordered)): + # cnt = 
contours_only_text_parent_d_ordered[i] + # ctr = centers_d[:, i] + # plt.gca().add_patch(ptchs.Polygon(cnt[:, 0], closed=False, fill=False, color='red')) + # plt.gca().scatter(ctr[0], ctr[1], 20, c='red', marker='x') + # plt.gca().text(ctr[0], ctr[1], str(i), c='red') + # plt.show() if not len(contours_only_text_parent): # stop early From 72d059f3c973b942945b62d4463a6ea031043efc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 14:34:12 +0100 Subject: [PATCH 028/118] reading order: simplify assignment / counting - `do_order_of_regions`: simplify aggregating per-box orders for paragraphs and headings to overall order passed to `xml_reading_order`; no need for `order_and_id_of_texts`, no need to return `id_of_texts_tot` - `do_order_of_regions_with_model`: no need to return `region_ids` - writer: no need to pass `id_of_texts_tot` in `build_pagexml` --- src/eynollah/eynollah.py | 70 +++++++++++++--------------------- src/eynollah/utils/__init__.py | 1 + src/eynollah/writer.py | 6 +-- 3 files changed, 30 insertions(+), 47 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index b7c6ddf..6024646 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -134,7 +134,6 @@ from .utils import ( return_boxes_of_images_by_order_of_reading_new ) from .utils.pil_cv2 import check_dpi, pil2cv -from .utils.xml import order_and_id_of_texts from .plot import EynollahPlotter from .writer import EynollahXmlWriter @@ -2546,9 +2545,7 @@ class Eynollah: args_contours_head = np.arange(len(contours_only_text_parent_h)) order_by_con_head = np.zeros_like(arg_text_con_head) - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] + idx = 0 for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) @@ -2557,37 +2554,25 @@ class Eynollah: con_inter_box = contours_only_text_parent[args_contours_box_main] con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] - indexes_sorted, 
kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( + _, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0]) - order_of_texts, id_of_texts = order_and_id_of_texts( - con_inter_box, con_inter_box_h, - indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + for tidx, kind in zip(index_by_kind_sorted, kind_of_texts_sorted): + if kind == 1: + # print(iij, "main", args_contours_box_main[tidx], "becomes", idx) + order_by_con_main[args_contours_box_main[tidx]] = idx + else: + # print(iij, "head", args_contours_box_head[tidx], "becomes", idx) + order_by_con_head[args_contours_box_head[tidx]] = idx + idx += 1 - indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] - indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] - indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - - for zahler, _ in enumerate(args_contours_box_main): - arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for zahler, _ in enumerate(args_contours_box_head): - arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ - np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - - for jji in range(len(id_of_texts)): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point += len(id_of_texts) - - order_of_texts_tot = np.concatenate((order_by_con_main, - order_by_con_head)) - order_text_new = np.argsort(order_of_texts_tot) - return order_text_new, id_of_texts_tot + # xml writer will create region ids in order of + # - contours_only_text_parent (main text), followed by + # - contours_only_text_parent (headings), + # and then create 
regionrefs into these ordered by order_text_new + order_text_new = np.argsort(np.concatenate((order_by_con_main, + order_by_con_head))) + return order_text_new try: results = match_boxes(False) @@ -3600,7 +3585,7 @@ class Eynollah: co_text_all = contours_only_text_parent if not len(co_text_all): - return [], [] + return [] labels_con = np.zeros((int(y_len /6.), int(x_len/6.), len(co_text_all)), dtype=bool) co_text_all = [(i/6).astype(int) for i in co_text_all] @@ -3683,11 +3668,9 @@ class Eynollah: else: org_contours_indexes.extend([indexes_of_located_cont[region_with_curr_order]]) - region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] - return org_contours_indexes, region_ids + return org_contours_indexes else: - region_ids = ['region_%04d' % i for i in range(len(co_text_all_org))] - return ordered, region_ids + return ordered def return_start_and_end_of_common_text_of_textline_ocr(self,textline_image, ind_tot): width = np.shape(textline_image)[1] @@ -4222,7 +4205,6 @@ class Eynollah: order_text_new = [0] slopes =[0] - id_of_texts_tot =['region_0001'] conf_contours_textregions =[0] if self.ocr and not self.tr: @@ -4234,7 +4216,7 @@ class Eynollah: ocr_all_textlines = None pcgts = self.writer.build_pagexml_no_full_layout( - cont_page, page_coord, order_text_new, id_of_texts_tot, + cont_page, page_coord, order_text_new, all_found_textline_polygons, page_coord, [], [], [], [], [], [], [], slopes, [], [], @@ -4736,14 +4718,14 @@ class Eynollah: self.logger.info("Headers ignored in reading order") if self.reading_order_machine_based: - order_text_new, id_of_texts_tot = self.do_order_of_regions_with_model( + order_text_new = self.do_order_of_regions_with_model( contours_only_text_parent, contours_only_text_parent_h, text_regions_p) else: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions( + order_text_new = self.do_order_of_regions( contours_only_text_parent, contours_only_text_parent_h, boxes, 
textline_mask_tot) else: - order_text_new, id_of_texts_tot = self.do_order_of_regions( + order_text_new = self.do_order_of_regions( contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) self.logger.info(f"Detection of reading order took {time.time() - t_order:.1f}s") @@ -4840,7 +4822,7 @@ class Eynollah: if self.full_layout: pcgts = self.writer.build_pagexml_full_layout( - contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, + contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals_left, polygons_of_marginals_right, @@ -4853,7 +4835,7 @@ class Eynollah: conf_contours_textregions, conf_contours_textregions_h) else: pcgts = self.writer.build_pagexml_no_full_layout( - contours_only_text_parent, page_coord, order_text_new, id_of_texts_tot, + contours_only_text_parent, page_coord, order_text_new, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals_left, polygons_of_marginals_right, all_found_textline_polygons_marginals_left, all_found_textline_polygons_marginals_right, diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 570eefe..20766a8 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1158,6 +1158,7 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): # cnt = (contours_main if type_ == 1 else contours_head)[idx] # col = 'red' if type_ == 1 else 'blue' # plt.scatter(cx - x_ref, cy - y_ref, 20, c=col, marker='o') + # plt.text(cx - x_ref, cy - y_ref, str(idx), c=col) # plt.gca().add_patch(patches.Polygon(cnt[:, 0] - [[x_ref, y_ref]], closed=False, fill=False, color=col)) # plt.title("box contours centered in %d:%d (red=main / blue=heading)" % 
(top, bot)) # plt.show() diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 9c3456a..f8aff62 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -89,7 +89,7 @@ class EynollahXmlWriter: def build_pagexml_no_full_layout( self, found_polygons_text_region, - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, @@ -102,7 +102,7 @@ class EynollahXmlWriter: **kwargs): return self.build_pagexml_full_layout( found_polygons_text_region, [], - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, [], all_box_coord, [], found_polygons_text_region_img, found_polygons_tables, [], @@ -116,7 +116,7 @@ class EynollahXmlWriter: def build_pagexml_full_layout( self, found_polygons_text_region, found_polygons_text_region_h, - page_coord, order_of_texts, id_of_texts, + page_coord, order_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, From 49ab269e085505940a17c355905795d91777a451 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 15:46:08 +0100 Subject: [PATCH 029/118] fix typos found by ruff --- src/eynollah/sbb_binarize.py | 2 +- src/eynollah/utils/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index 0eab2ae..b81f45e 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -328,7 +328,7 @@ class SbbBinarizer: print(input_path, 'image_name') if os.path.exists(output_path): if overwrite: - self.logger.warning("will overwrite existing output file '%s'", output_ptah) + self.logger.warning("will overwrite existing output file '%s'", output_path) else: self.logger.warning("will skip input for existing output file '%s'", output_path) image = 
cv2.imread(input_path) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 20766a8..7be1fd0 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -146,7 +146,7 @@ def return_multicol_separators_x_start_end( args_emptysep.add(i) # print(j, i, "%d:%d" % (y_mid[j], y_mid[i]), "%d:%d" % (x_start[i], x_end[i]), "empty next sep") args_to_be_kept = [arg for arg in args_ysorted - if not arg in args_emptysep] + if arg not in args_emptysep] x_start = x_start[args_to_be_kept] x_end = x_end[args_to_be_kept] y_min = y_min[args_to_be_kept] From 028ed169212df4a1048b26d691e1edc53592f230 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 17:17:37 +0100 Subject: [PATCH 030/118] adapt ocrd-sbb-binarize --- src/eynollah/ocrd_cli_binarization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/eynollah/ocrd_cli_binarization.py b/src/eynollah/ocrd_cli_binarization.py index 848bbac..6289517 100644 --- a/src/eynollah/ocrd_cli_binarization.py +++ b/src/eynollah/ocrd_cli_binarization.py @@ -70,7 +70,7 @@ class SbbBinarizeProcessor(Processor): if oplevel == 'page': self.logger.info("Binarizing on 'page' level in page '%s'", page_id) - page_image_bin = cv2pil(self.binarizer.run(image=pil2cv(page_image), use_patches=True)) + page_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(page_image), use_patches=True)) # update PAGE (reference the image file): page_image_ref = AlternativeImageType(comments=page_xywh['features'] + ',binarized,clipped') page.add_AlternativeImage(page_image_ref) @@ -83,7 +83,7 @@ class SbbBinarizeProcessor(Processor): for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') - region_image_bin = cv2pil(self.binarizer.run(image=pil2cv(region_image), use_patches=True)) + region_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(region_image), use_patches=True)) # 
update PAGE (reference the image file): region_image_ref = AlternativeImageType(comments=region_xywh['features'] + ',binarized') region.add_AlternativeImage(region_image_ref) @@ -95,7 +95,7 @@ class SbbBinarizeProcessor(Processor): self.logger.warning("Page '%s' contains no text lines", page_id) for line in lines: line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh, feature_filter='binarized') - line_image_bin = cv2pil(self.binarizer.run(image=pil2cv(line_image), use_patches=True)) + line_image_bin = cv2pil(self.binarizer.run_single(image=pil2cv(line_image), use_patches=True)) # update PAGE (reference the image file): line_image_ref = AlternativeImageType(comments=line_xywh['features'] + ',binarized') line.add_AlternativeImage(region_image_ref) From 406288b1fed020c2a68e20114ec51fe4d7f580f8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 15 Nov 2025 20:13:58 +0100 Subject: [PATCH 031/118] fixup 72d059f3: forgot to update other writer calls --- src/eynollah/eynollah.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 6024646..46a1704 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4164,7 +4164,7 @@ class Eynollah: image_page, page_coord, cont_page = \ self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier) pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], + [], page_coord, [], [], [], polygons_of_images, [], [], [], [], [], [], [], [], [], cont_page, [], []) if self.plotter: @@ -4282,7 +4282,7 @@ class Eynollah: self.logger.info("No columns detected - generating empty PAGE-XML") pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], [], + [], page_coord, [], [], [], [], [], [], [], [], [], [], [], [], [], cont_page, [], []) return pcgts @@ -4529,7 +4529,7 @@ class Eynollah: 
empty_marginals = [[]] * len(polygons_of_marginals) if self.full_layout: pcgts = self.writer.build_pagexml_full_layout( - [], [], page_coord, [], [], [], [], [], [], + [], [], page_coord, [], [], [], [], [], polygons_of_images, contours_tables, [], polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, @@ -4538,7 +4538,7 @@ class Eynollah: cont_page, polygons_seplines) else: pcgts = self.writer.build_pagexml_no_full_layout( - [], page_coord, [], [], [], [], + [], page_coord, [], [], [], polygons_of_images, polygons_of_marginals, polygons_of_marginals, empty_marginals, empty_marginals, From e428e7ad78629d9d4a39fa9c49f88aa4c6244139 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 16 Nov 2025 12:17:29 +0100 Subject: [PATCH 032/118] ensure separators stay within image bounds --- src/eynollah/utils/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 7be1fd0..307d8f3 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1400,6 +1400,14 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, matrix_of_seps_ch = np.append( matrix_of_seps_ch, matrix_l_n, axis=0) + # ensure no seps are out of bounds + matrix_of_seps_ch[:, 1] = np.maximum(np.minimum(matrix_of_seps_ch[:, 1], region_pre_p.shape[1]), 0) + matrix_of_seps_ch[:, 2] = np.maximum(matrix_of_seps_ch[:, 2], 0) + matrix_of_seps_ch[:, 3] = np.minimum(matrix_of_seps_ch[:, 3], region_pre_p.shape[1]) + matrix_of_seps_ch[:, 5] = np.maximum(np.minimum(matrix_of_seps_ch[:, 5], region_pre_p.shape[0]), 0) + matrix_of_seps_ch[:, 6] = np.maximum(matrix_of_seps_ch[:, 6], 0) + matrix_of_seps_ch[:, 7] = np.minimum(matrix_of_seps_ch[:, 7], region_pre_p.shape[0]) + cy_seps_splitters=cy_seps_hor[(x_min_seps_hor<=.16*region_pre_p.shape[1]) & (x_max_seps_hor>=.84*region_pre_p.shape[1])] cy_seps_splitters = np.append(cy_seps_splitters, 
special_separators) @@ -1621,7 +1629,7 @@ def return_boxes_of_images_by_order_of_reading_new( starting = xmin - peaks_neg_tot min_start = np.flatnonzero(starting >= 0)[-1] # last left-of ending = xmax - peaks_neg_tot - max_end = np.flatnonzero(ending < 0)[0] # first right-of + max_end = np.flatnonzero(ending <= 0)[0] # first right-of # skip elongation unless this is already a multi-column separator/heading: if not max_end - min_start > 1: continue From ee59a6809dedd175fc47a159e6a274f7f43dd534 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 16:17:09 +0100 Subject: [PATCH 033/118] contours_in_same_horizon: fix 5d15941b --- src/eynollah/utils/contour.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 052688c..393acdd 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -20,7 +20,7 @@ def contours_in_same_horizon(cy_main_hor): by index into the array. """ sort = np.argsort(cy_main_hor) - same = np.diff(cy_main_hor[sort] <= 20) + same = np.diff(cy_main_hor[sort]) <= 20 # groups = np.split(sort, np.arange(len(cy_main_hor) - 1)[~same] + 1) same = np.flatnonzero(same) return np.stack((sort[:-1][same], sort[1:][same])).T From 38d91673b11fb6dde03b98325d2dca2ef282310a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 16:50:08 +0100 Subject: [PATCH 034/118] combine_hor_lines_and_delete_cross_points: get external contours instead of tree without looking at the actual hierarchy (to prevent retrieving holes as separators) --- src/eynollah/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 307d8f3..1934f10 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1180,7 +1180,7 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( #img_p_in_ver = 
cv2.erode(img_p_in_ver, self.kernel, iterations=2) _, thresh = cv2.threshold(img_p_in_ver, 0, 255, 0) - contours_lines_ver, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + contours_lines_ver, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) slope_lines_ver, _, x_min_main_ver, _, _, _, y_min_main_ver, y_max_main_ver, cx_main_ver = \ find_features_of_lines(contours_lines_ver) for i in range(len(x_min_main_ver)): @@ -1194,7 +1194,7 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( int(cx_main_ver[i])+25] = 0 _, thresh = cv2.threshold(img_in_hor, 0, 255, 0) - contours_lines_hor, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + contours_lines_hor, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) slope_lines_hor, dist_x_hor, x_min_main_hor, x_max_main_hor, cy_main_hor, _, _, _, _ = \ find_features_of_lines(contours_lines_hor) From 06cb9d1d3184ebf35d524305785fbe28b1d9c3f8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 17:08:39 +0100 Subject: [PATCH 035/118] combine_hor_lines_and_delete_cross_points: fix 1-off px bug when eroding the vertical separator mask (by slicing), avoid leaving 1px strips --- src/eynollah/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 1934f10..345d438 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1189,7 +1189,7 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( int(cx_main_ver[i])-25: int(cx_main_ver[i])+25] = 0 img_p_in_ver[int(y_max_main_ver[i])-30: - int(y_max_main_ver[i]), + int(y_max_main_ver[i]+1), int(cx_main_ver[i])-25: int(cx_main_ver[i])+25] = 0 From 5c12b6a8513b202fb97e1ccb93854a906aab6677 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 
Nov 2025 17:27:12 +0100 Subject: [PATCH 036/118] combine_hor_lines_and_delete_cross_points: simplify and rename MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `x_width_smaller_than_acolumn_width` → `avg_col_width` - `len_lines_bigger_than_x_width_smaller_than_acolumn_width` → `nseps_wider_than_than_avg_col_width` - `img_in_hor` → `img_p_in_hor` (analogous to vertical) --- src/eynollah/utils/__init__.py | 52 +++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 345d438..0f9dcaf 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1176,7 +1176,23 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type) def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( - img_p_in_ver, img_in_hor,num_col_classifier): + img_p_in_ver: np.ndarray, + img_p_in_hor: np.ndarray, + num_col_classifier: int, +) -> Tuple[np.ndarray, List[float]]: + """ + Given a horizontal and vertical separator mask, combine horizontal separators + (where possible) and make sure they do not cross each other. 
+ + Arguments: + * img_p_in_ver: mask of vertical separators + * img_p_in_hor: mask of horizontal separators + * num_col_classifier: predicted (expected) number of columns + + Returns: a tuple of + * the final horizontal separators + * the y coordinates with horizontal separators spanning the full width + """ #img_p_in_ver = cv2.erode(img_p_in_ver, self.kernel, iterations=2) _, thresh = cv2.threshold(img_p_in_ver, 0, 255, 0) @@ -1192,20 +1208,26 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( int(y_max_main_ver[i]+1), int(cx_main_ver[i])-25: int(cx_main_ver[i])+25] = 0 + height, width = img_p_in_ver.shape - _, thresh = cv2.threshold(img_in_hor, 0, 255, 0) + _, thresh = cv2.threshold(img_p_in_hor, 0, 255, 0) contours_lines_hor, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - slope_lines_hor, dist_x_hor, x_min_main_hor, x_max_main_hor, cy_main_hor, _, _, _, _ = \ - find_features_of_lines(contours_lines_hor) - x_width_smaller_than_acolumn_width=img_in_hor.shape[1]/float(num_col_classifier+1.) 
+ (slope_lines_hor, + dist_x_hor, + x_min_main_hor, + x_max_main_hor, + cy_main_hor, _, + y_min_main_hor, + y_max_main_hor, + _) = find_features_of_lines(contours_lines_hor) - len_lines_bigger_than_x_width_smaller_than_acolumn_width=len( dist_x_hor[dist_x_hor>=x_width_smaller_than_acolumn_width] ) - len_lines_bigger_than_x_width_smaller_than_acolumn_width_per_column=int(len_lines_bigger_than_x_width_smaller_than_acolumn_width / - float(num_col_classifier)) - if len_lines_bigger_than_x_width_smaller_than_acolumn_width_per_column < 10: + avg_col_width = width / float(num_col_classifier + 1) + nseps_wider_than_than_avg_col_width = np.count_nonzero(dist_x_hor>=avg_col_width) + if nseps_wider_than_than_avg_col_width < 10 * num_col_classifier: args_hor=np.arange(len(slope_lines_hor)) sep_pairs=contours_in_same_horizon(cy_main_hor) + img_p_in = np.copy(img_p_in_hor) if len(sep_pairs): special_separators=[] contours_new=[] @@ -1242,21 +1264,19 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( # np.var( dist_x_hor[some_args] ),'jalibdiha') special_separators.append(np.mean(cy_main_hor[some_args])) else: - img_p_in=img_in_hor - special_separators=[] + img_p_in = img_p_in_hor + special_separators = [] img_p_in_ver[img_p_in_ver == 255] = 1 - sep_ver_hor = img_p_in + img_p_in_ver - sep_ver_hor_cross = (sep_ver_hor == 2) * 1 - _, thresh = cv2.threshold(sep_ver_hor_cross.astype(np.uint8), 0, 255, 0) + sep_ver_hor_cross = 255 * ((img_p_in > 0) & (img_p_in_ver > 0)) contours_cross, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) center_cross = np.array(find_center_of_contours(contours_cross), dtype=int) for cx, cy in center_cross.T: img_p_in[cy - 30: cy + 30, cx + 5: cx + 40] = 0 img_p_in[cy - 30: cy + 30, cx - 40: cx - 4] = 0 else: - img_p_in=np.copy(img_in_hor) - special_separators=[] + img_p_in = np.copy(img_p_in_hor) + special_separators = [] return img_p_in, special_separators def 
return_points_with_boundies(peaks_neg_fin, first_point, last_point): From a527d7a10d50ff68af888ed66aba30c53d46520b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 17:34:11 +0100 Subject: [PATCH 037/118] combine_hor_lines_and_delete_cross_points: improve MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - avoid unnecessary `fillPoly` (we already have the mask) - do not merge hseps if vseps interfere - remove old criterion (based on total length of hseps) - create new criterion (no x overlap and x close to each other) - rename identifiers: * `sum_dis` → `sum_xspan` * `diff_max_min_uniques` → `tot_xspan` * np.std / np.mean → `dev_xspan` - remove rule cutting around the center of crossing seps (which is unnecessary and creates small isolated seps at the center, unrelated to the actual crossing points) - create rule cutting hseps by vseps _prior_ to merging --- src/eynollah/utils/__init__.py | 61 ++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 0f9dcaf..765d5b1 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1194,6 +1194,9 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( * the y coordinates with horizontal separators spanning the full width """ + # cut horizontal seps by vertical seps + img_p_in_hor[img_p_in_ver > 0] = 0 + #img_p_in_ver = cv2.erode(img_p_in_ver, self.kernel, iterations=2) _, thresh = cv2.threshold(img_p_in_ver, 0, 255, 0) contours_lines_ver, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) @@ -1237,24 +1240,34 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( some_cy=cy_main_hor[pair] some_x_min=x_min_main_hor[pair] some_x_max=x_max_main_hor[pair] + some_y_min=y_min_main_hor[pair] + some_y_max=y_max_main_hor[pair] + if 
np.any(img_p_in_ver[some_y_min.min(): some_y_max.max(), + some_x_max.min(): some_x_min.max()]): + # print("horizontal pair cut by vertical sep", pair, some_args, some_cy, + # "%d:%d" % (some_x_min[0], some_x_max[0]), + # "%d:%d" % (some_x_min[1], some_x_max[1])) + continue #img_in=np.zeros(separators_closeup_n[:,:,2].shape) #print(img_p_in_ver.shape[1],some_x_max-some_x_min,'xdiff') - diff_x_some=some_x_max-some_x_min - for jv in range(len(some_args)): - img_p_in=cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1,1,1)) - if any(i_diff>(img_p_in_ver.shape[1]/float(3.3)) for i_diff in diff_x_some): - img_p_in[int(np.mean(some_cy))-5: - int(np.mean(some_cy))+5, - int(np.min(some_x_min)): - int(np.max(some_x_max)) ]=1 - sum_dis=dist_x_hor[some_args].sum() - diff_max_min_uniques=np.max(x_max_main_hor[some_args])-np.min(x_min_main_hor[some_args]) + sum_xspan = dist_x_hor[some_args].sum() + tot_xspan = np.max(x_max_main_hor[some_args]) - np.min(x_min_main_hor[some_args]) + dev_xspan = np.std(dist_x_hor[some_args]) / np.mean(dist_x_hor[some_args]) + if (tot_xspan > sum_xspan and # no x overlap + sum_xspan > 0.85 * tot_xspan): # x close to each other + # print("merging horizontal pair", pair, some_args, some_cy, + # "%d:%d" % (some_x_min[0], some_x_max[0]), + # "%d:%d" % (some_x_min[1], some_x_max[1])) + img_p_in[int(np.mean(some_cy)) - 5: + int(np.mean(some_cy)) + 5, + np.min(some_x_min): + np.max(some_x_max)] = 255 - if (diff_max_min_uniques > sum_dis and - sum_dis / float(diff_max_min_uniques) > 0.85 and - diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and - np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): + if (tot_xspan > sum_xspan and # no x overlap + sum_xspan > 0.85 * tot_xspan and # x close to each other + tot_xspan > 0.85 * width and # nearly full width + dev_xspan < 0.55): # similar x span # print(dist_x_hor[some_args], # dist_x_hor[some_args].sum(), # np.min(x_min_main_hor[some_args]), @@ -1263,17 
+1276,23 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( # np.std( dist_x_hor[some_args] ), # np.var( dist_x_hor[some_args] ),'jalibdiha') special_separators.append(np.mean(cy_main_hor[some_args])) + # print("special separator for midline", special_separators[-1]) + # plt.subplot(1, 2, 1, title='original horizontal (1) / vertical (2) seps') + # plt.imshow(1 * (img_p_in_hor > 0) + 2 * (img_p_in_ver > 0)) + # plt.subplot(1, 2, 2, title='extended horizontal seps') + # plt.imshow(img_p_in) + # plt.show() else: img_p_in = img_p_in_hor special_separators = [] - img_p_in_ver[img_p_in_ver == 255] = 1 - sep_ver_hor_cross = 255 * ((img_p_in > 0) & (img_p_in_ver > 0)) - contours_cross, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - center_cross = np.array(find_center_of_contours(contours_cross), dtype=int) - for cx, cy in center_cross.T: - img_p_in[cy - 30: cy + 30, cx + 5: cx + 40] = 0 - img_p_in[cy - 30: cy + 30, cx - 40: cx - 4] = 0 + #img_p_in_ver[img_p_in_ver == 255] = 1 + # sep_ver_hor_cross = 255 * ((img_p_in > 0) & (img_p_in_ver > 0)) + # contours_cross, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + # center_cross = np.array(find_center_of_contours(contours_cross), dtype=int) + # for cx, cy in center_cross.T: + # img_p_in[cy - 30: cy + 30, cx + 5: cx + 40] = 0 + # img_p_in[cy - 30: cy + 30, cx - 40: cx - 4] = 0 else: img_p_in = np.copy(img_p_in_hor) special_separators = [] From b71bb80e3ad9afa8f94c64af9dc73ee6269c5cae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 17:53:27 +0100 Subject: [PATCH 038/118] return_boxes_of_images_by_order_of_reading_new: fix 4abc2ff5 (forgot to also flip `regions_with_separators` if right2left) --- src/eynollah/utils/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 765d5b1..1aecd11 100644 --- 
a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1508,7 +1508,7 @@ def return_boxes_of_images_by_order_of_reading_new( * splitter_y_new: the y coordinates separating the parts * regions_without_separators: (text) region mask with separators suppressed; (needed to find per-part columns and to combine separators if possible) - * regions_with_separators: (full) region map with separators suppressed; + * regions_with_separators: (full) region map with separators included; (needed to elongate separators if possible) * matrix_of_seps: type and coordinates of horizontal and vertical separators, as well as headings @@ -1525,6 +1525,7 @@ def return_boxes_of_images_by_order_of_reading_new( if right2left_readingorder: regions_without_separators = cv2.flip(regions_without_separators,1) + regions_with_separators = cv2.flip(regions_with_separators,1) if logger is None: logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') From 5abf0c1097e76a038d451a78a785f08fa4e897bf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 17:58:44 +0100 Subject: [PATCH 039/118] return_boxes_of_images_by_order_of_reading_new: improve - when analysing regions spanning across columns, disregard tiny regions (smaller than half the median size) - if a region spans across columns just by a tiny fraction, and therefore is not good enough for a multi-col separator, then it should also not be good enough for a multi-col box maker --- src/eynollah/utils/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 1aecd11..bf2ec15 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1720,6 +1720,7 @@ def return_boxes_of_images_by_order_of_reading_new( # analyse connected components of regions to gain additional separators # and prepare a map for cross-column boxes ccounts = 
np.bincount(ccomps[top: bot].flatten()) + ccounts_median = np.median(ccounts) col_ccounts = np.stack([np.bincount(ccomps[top: bot, left: right].flatten(), minlength=ccounts.size) for left, right in pairwise(peaks_neg_tot)]) @@ -1727,6 +1728,9 @@ def return_boxes_of_images_by_order_of_reading_new( for label, label_count in enumerate(ccounts): if not label: continue + # ignore small labels for the purpose of finding multicol seps + if label_count < 0.5 * ccounts_median: + continue label_left, label_top, label_width, label_height, label_area = cstats[label] # if label_count < 0.9 * label_area: # # mostly not in this part of the page @@ -1738,15 +1742,15 @@ def return_boxes_of_images_by_order_of_reading_new( label_bot = label_top + label_height label_start = np.flatnonzero(peaks_neg_tot > label_left)[0] - 1 label_end = np.flatnonzero(peaks_neg_tot >= label_right)[0] + if label_end - label_start < 2: + continue + if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2: + continue # store as dict for multi-column boxes: for start in range(label_start, label_end): labelcolmap.setdefault(start, list()).append( (label_end, label_top, label_bot, sum(col_ccounts[start: label_end, label]))) # make additional separators: - if label_end - label_start < 2: - continue - if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2: - continue x_min_hor_some = np.append(x_min_hor_some, [label_left] * 2) x_max_hor_some = np.append(x_max_hor_some, [label_right] * 2) y_min_hor_some = np.append(y_min_hor_some, [label_top - 2, label_bot]) From 84d10962f3382fd912ca5acef7fcb3d395aad41a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 18:04:12 +0100 Subject: [PATCH 040/118] return_boxes_of_images_by_order_of_reading_new: improve - when searching for multi-col box makers, pick the right-most allowable column, not the left-most --- src/eynollah/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index bf2ec15..2ebf48a 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1796,7 +1796,7 @@ def return_boxes_of_images_by_order_of_reading_new( # "box height", (y_bot - y_top), # "label height", sum(regions_without_separators[ # y_top: y_bot, peaks_neg_tot[start + 1]])) - return min((last for last, l_top, l_bot, l_count in labelcolmap.get(start, []) + return max((last for last, l_top, l_bot, l_count in labelcolmap.get(start, []) # yield the right-most column that does not cut through # any regions in this horizontal span if y_top < l_bot and y_bot > l_top From 4dd40c542b3384322febf821c0c761bc9cb4dc46 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 18:07:15 +0100 Subject: [PATCH 041/118] find_num_col: add optional criterion - sum of vertical separators when searching for gaps between text regions, consider the vertical separator mask (if given): add the vertical sum of vertical separators to the peak scores (making column detection more robust if still slightly skewed or partially obscured by multi-column regions, but fg seps are present) --- src/eynollah/utils/__init__.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 2ebf48a..0f2dac3 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -241,10 +241,13 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8): z = gaussian_filter1d(regions_without_separators_0, sigma_) return np.std(z) -def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8, unbalanced=False): +def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8, unbalanced=False, vertical_separators=None): if not regions_without_separators.any(): return 0, [] + if vertical_separators is None: + 
vertical_separators = np.zeros_like(regions_without_separators) regions_without_separators_0 = regions_without_separators.sum(axis=0) + vertical_separators_0 = vertical_separators.sum(axis=0) # fig, (ax1, ax2) = plt.subplots(2, sharex=True) # ax1.imshow(regions_without_separators, aspect="auto") # ax2.plot(regions_without_separators_0) @@ -258,13 +261,12 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl first_nonzero = first_nonzero + 50 #+ 200 last_offmargin = len(regions_without_separators_0) - 170 #370 first_offmargin = 170 #370 + x = vertical_separators_0 y = regions_without_separators_0 # [first_nonzero:last_nonzero] - y_help = np.zeros(len(y) + 20) - y_help[10 : len(y) + 10] = y - x = np.arange(len(y)) - zneg_rev = -y_help + np.max(y_help) - zneg = np.zeros(len(zneg_rev) + 20) - zneg[10 : len(zneg_rev) + 10] = zneg_rev + y_help = np.pad(y, (10, 10), constant_values=(0, 0)) + zneg_rev = y.max() - y_help + zneg = np.pad(zneg_rev, (10, 10), constant_values=(0, 0)) + x = gaussian_filter1d(x, sigma_) z = gaussian_filter1d(y, sigma_) zneg = gaussian_filter1d(zneg, sigma_) @@ -333,6 +335,7 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0 # extra criterion: fixed multiple of lowest gap height + # print("grenze", grenze, multiplier * (5 + np.min(interest_neg))) grenze = min(grenze, multiplier * (5 + np.min(interest_neg))) # print(interest_neg,'interest_neg') @@ -341,16 +344,22 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(dis_talaei,'dis_talaei') # print(peaks_neg,'peaks_neg') # fig, (ax1, ax2) = plt.subplots(2, sharex=True) - # ax1.imshow(regions_without_separators, aspect="auto") + # ax1.imshow(regions_without_separators + 5 * vertical_separators, aspect="auto") # ax2.plot(z, color='red', label='z') # ax2.plot(zneg[20:], color='blue', label='zneg') + # ax2.plot(x, 
color='green', label='vsep') # ax2.scatter(peaks_neg, z[peaks_neg], color='red') # ax2.scatter(peaks_neg, zneg[20:][peaks_neg], color='blue') - # ax2.axhline(min_peaks_pos, color='red', label="min_peaks_pos") - # ax2.axhline(grenze, color='blue', label="grenze") + # ax2.axhline(min_peaks_pos, color='red') + # ax2.axhline(grenze, color='blue') + # ax2.annotate("min_peaks_pos", xy=(0, min_peaks_pos), color='red') + # ax2.annotate("grenze", xy=(0, grenze), color='blue') # ax2.text(0, grenze, "grenze") + # ax2.legend() # plt.show() + # print("vsep", x[peaks_neg]) + interest_neg = interest_neg - x[peaks_neg] interest_neg_fin = interest_neg[(interest_neg < grenze)] peaks_neg_fin = peaks_neg[(interest_neg < grenze)] From 5a3de3b42db5d92e7743e49c43315d0e98e679cd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 18:14:24 +0100 Subject: [PATCH 042/118] column detection: improve, aided by vseps whenever possible - `find_number_of_columns_in_document`: retain vertical separators and pass to `find_num_col` for each vertical split - `return_boxes_of_images_by_order_of_reading_new`: reconstruct the vertical separators from the segmentation mask and the separator bboxes; pass it on to `find_num_col` everywhere - `return_boxes_of_images_by_order_of_reading_new`: no need to try-catch `find_num_col` anymore - `return_boxes_of_images_by_order_of_reading_new`: when a vertical split has too few columns, * do not raise but lower the threshold `multiplier` responsible for allowing gaps as column boundaries * do not pass the `num_col_classifier` (i.e. 
expected number of resulting columns) of the entire page to the iterative `find_num_col` for each existing column, but only the portion of that span --- src/eynollah/utils/__init__.py | 97 ++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 0f2dac3..43d5d75 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import List, Tuple from logging import getLogger import time import math @@ -1315,7 +1315,35 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot.append(last_point) return peaks_neg_tot -def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): +def find_number_of_columns_in_document( + region_pre_p: np.ndarray, + num_col_classifier: int, + tables: bool, + label_seps: int, + contours_h: List[np.ndarray] = None, + logger=None +) -> Tuple[int, List[int], np.ndarray, List[int], np.ndarray]: + """ + Extract vertical and horizontal separators, vertical splits and horizontal column boundaries on page. 
+ + Arguments: + * region_pre_p: segmentation map of the page + * num_col_classifier: predicted (expected) number of columns of the page + * tables: whether tables may be present + * label_seps: segmentation map class label for separators + * contours_h: polygons of potential headings (serving as additional horizontal separators) + * logger + + Returns: a tuple of + * the actual number of columns found + * the x coordinates of the column boundaries + * an array of the separators (bounding boxes and types) + * the y coordinates of the page splits + * a mask of the separators + """ + if logger is None: + logger = getLogger(__package__) + separators_closeup = 1 * (region_pre_p == label_seps) separators_closeup[0:110] = 0 separators_closeup[-150:] = 0 @@ -1483,8 +1511,11 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, num_big_parts += 1 try: num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot], - num_col_classifier, tables, multiplier=7.0) - # print("big part %d:%d has %d columns" % (top, bot, num_col + 1), peaks_neg_fin) + num_col_classifier, tables, + vertical_separators=1 * (vertical[top: bot] > 0), + multiplier=7.0) + logger.debug("big part %d:%d has %d columns", top, bot, num_col + 1) + # print(peaks_neg_fin) except: num_col = 0 peaks_neg_fin = [] @@ -1522,7 +1553,8 @@ def return_boxes_of_images_by_order_of_reading_new( * matrix_of_seps: type and coordinates of horizontal and vertical separators, as well as headings * num_col_classifier: predicted number of columns for the entire page - * erosion_hurts: bool + * erosion_hurts: whether region masks have already been eroded + (and thus gaps can be expected to be wider) * tables: bool * right2left_readingorder: whether to invert the default left-to-right order @@ -1578,6 +1610,12 @@ def return_boxes_of_images_by_order_of_reading_new( height_tot, width_tot = regions_without_separators.shape big_part = 22 * height_tot // 100 # percent height _, ccomps, cstats, _ 
= cv2.connectedComponentsWithStats(regions_without_separators.astype(np.uint8)) + args_ver = matrix_of_seps_ch[:, 9] == 1 + mask_ver = np.zeros_like(regions_without_separators, dtype=bool) + for i in np.flatnonzero(args_ver): + mask_ver[matrix_of_seps_ch[i, 6]: matrix_of_seps_ch[i, 7], + matrix_of_seps_ch[i, 2]: matrix_of_seps_ch[i, 3]] = True + vertical_seps = 1 * ((regions_with_separators == 6) & mask_ver) for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) @@ -1589,16 +1627,13 @@ def return_boxes_of_images_by_order_of_reading_new( #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= # 0.1 * (np.abs(bot-top))): - try: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - # we do not expect to get all columns in small parts (headings etc.): - num_col_classifier if bot - top >= big_part else 1, - tables, multiplier=6. if erosion_hurts else 7., - unbalanced=True) - except: - peaks_neg_fin=[] - num_col = 0 + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + # we do not expect to get all columns in small parts (headings etc.): + num_col_classifier if bot - top >= big_part else 1, + tables, vertical_separators=vertical_seps[top: bot], + multiplier=6. 
if erosion_hurts else 7., + unbalanced=True) try: if ((len(peaks_neg_fin) + 1 < num_col_classifier or num_col_classifier == 6) and @@ -1606,12 +1641,18 @@ def return_boxes_of_images_by_order_of_reading_new( bot - top >= big_part): # found too few columns here #print('burda') + logger.debug("searching for more than %d columns in big part %d:%d", + len(peaks_neg_fin) + 1, top, bot) peaks_neg_fin_org = np.copy(peaks_neg_fin) #print("peaks_neg_fin_org", peaks_neg_fin_org) - if len(peaks_neg_fin)==0: + if len(peaks_neg_fin) == 0: num_col, peaks_neg_fin = find_num_col( regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=3., unbalanced=True) + num_col_classifier, tables, + vertical_separators=vertical_seps[top: bot], + # try to be less strict (lower threshold than above) + multiplier=7. if erosion_hurts else 8., + unbalanced=True) #print(peaks_neg_fin,'peaks_neg_fin') peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] @@ -1625,22 +1666,19 @@ def return_boxes_of_images_by_order_of_reading_new( # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) # plt.title("vertical projection (sum over y)") # plt.show() - try: - _, peaks_neg_fin1 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=7.) - except: - peaks_neg_fin1 = [] - try: - _, peaks_neg_fin2 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=5.) - except: - peaks_neg_fin2 = [] + # try to get more peaks with different multipliers + num_col_expected = round((right - left) / width_tot * num_col_classifier) + args = regions_without_separators[top:bot, left:right], num_col_expected, tables + kwargs = dict(vertical_separators=vertical_seps[top: bot, left:right]) + _, peaks_neg_fin1 = find_num_col(*args, **kwargs, multiplier=7.) + _, peaks_neg_fin2 = find_num_col(*args, **kwargs, multiplier=5.) 
if len(peaks_neg_fin1) >= len(peaks_neg_fin2): peaks_neg_fin = peaks_neg_fin1 else: peaks_neg_fin = peaks_neg_fin2 + # print(peaks_neg_fin) + logger.debug("found %d additional column boundaries in %d:%d", + len(peaks_neg_fin), left, right) # add offset to local result peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') @@ -1652,6 +1690,7 @@ def return_boxes_of_images_by_order_of_reading_new( #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + #print("found more peaks than at first glance", peaks_neg_fin_rev, peaks_neg_fin_org) peaks_neg_fin = peaks_neg_fin_rev else: peaks_neg_fin = peaks_neg_fin_org From adcea47bc05ccbdfa76c6059d5f66e4610e5ae41 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 18:23:59 +0100 Subject: [PATCH 043/118] return_boxes_of_images_by_order_of_reading_new: always erode when passing the text region mask, do not apply erosion only if there are more than 2 columns, but iff `not erosion_hurts` (consistent with `find_num_col`'s expectations and making it as easy to find the column gaps on 1 and 2-column pages as on multi-column pages) --- src/eynollah/eynollah.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 46a1704..47198cb 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2356,7 +2356,6 @@ class Eynollah: img_only_regions_with_sep = (prediction_regions_org_y == 1).astype(np.uint8) try: img_only_regions = cv2.erode(img_only_regions_with_sep[:,:], KERNEL, iterations=20) - _, _ = find_num_col(img_only_regions, num_col_classifier, self.tables, multiplier=6.0) img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1]*(1.2 if is_image_enhanced else 1))) prediction_regions_org = self.do_prediction(True, img, self.models["region"]) @@ -3138,7 +3137,7 @@ class Eynollah: #print(time.time()-t_0_box,'time box in 
2') self.logger.info("num_col_classifier: %s", num_col_classifier) - if num_col_classifier >= 3: + if not erosion_hurts: if np.abs(slope_deskew) < SLOPE_THRESHOLD: regions_without_separators = regions_without_separators.astype(np.uint8) regions_without_separators = cv2.erode(regions_without_separators[:, :], KERNEL, iterations=6) @@ -3289,21 +3288,16 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( text_regions_p, num_col_classifier, self.tables, label_seps) - - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - num_col_d, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_d, num_col_classifier, self.tables, label_seps) - - if num_col_classifier>=3: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: + if not erosion_hurts: regions_without_separators = regions_without_separators.astype(np.uint8) regions_without_separators = cv2.erode(regions_without_separators[:,:], KERNEL, iterations=6) - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + else: + num_col_d, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_d, num_col_classifier, self.tables, label_seps) + if not erosion_hurts: regions_without_separators_d = regions_without_separators_d.astype(np.uint8) regions_without_separators_d = cv2.erode(regions_without_separators_d[:,:], KERNEL, iterations=6) - else: - pass if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( @@ -4149,6 +4143,7 @@ class Eynollah: self.run_enhancement(self.light_version) self.logger.info(f"Image: {self.image.shape[1]}x{self.image.shape[0]}, " + f"scale {self.scale_x:.1f}x{self.scale_y:.1f}, " f"{self.dpi} DPI, {num_col_classifier} columns") if is_image_enhanced: self.logger.info("Enhancement applied") @@ -4682,7 +4677,7 @@ class Eynollah: _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = 
find_number_of_columns_in_document( text_regions_p_d, num_col_classifier, self.tables, label_seps) - if num_col_classifier >= 3: + if not erosion_hurts: if np.abs(slope_deskew) < SLOPE_THRESHOLD: regions_without_separators = regions_without_separators.astype(np.uint8) regions_without_separators = cv2.erode(regions_without_separators[:, :], KERNEL, iterations=6) From 56e73bf72f412e5fb235a1c525834130a8932880 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 28 Nov 2025 18:27:58 +0100 Subject: [PATCH 044/118] deskewing: add a 2nd stage for precision after selecting the optimum angle on the original search range, narrow down the search to the vicinity with half the range (adding computational costs, but gaining precision) --- src/eynollah/utils/separate_lines.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 22ef00d..7e415b5 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1564,6 +1564,9 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, angle2, var2 = get_smallest_skew(img_resized, sigma_des, angles2, map=map, logger=logger, plotter=plotter) if var2 > var: angle = angle2 + # precision stage: + angles = np.linspace(angle - 2.5, angle + 2.5, n_tot_angles // 2) + angle, _ = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) return angle def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map): From 43a95842bd0e4e29337b227183a231a8cf288646 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 2 Dec 2025 16:35:32 +0100 Subject: [PATCH 045/118] writer: also ensure validity after scaling --- src/eynollah/eynollah.py | 8 ++-- src/eynollah/writer.py | 93 +++++++++++++++------------------------- 2 files changed, 39 insertions(+), 62 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 47198cb..cceab31 100644 --- 
a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -1670,10 +1670,10 @@ class Eynollah: else: box = [0, 0, self.image.shape[1], self.image.shape[0]] cropped_page, page_coord = crop_image_inside_box(box, self.image) - cont_page.append(np.array([[page_coord[2], page_coord[0]], - [page_coord[3], page_coord[0]], - [page_coord[3], page_coord[1]], - [page_coord[2], page_coord[1]]])) + cont_page.append(np.array([[[page_coord[2], page_coord[0]]], + [[page_coord[3], page_coord[0]]], + [[page_coord[3], page_coord[1]]], + [[page_coord[2], page_coord[1]]]])) return cropped_page, page_coord, cont_page def early_page_for_num_of_column_classification(self,img_bin): diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index f8aff62..2e9c895 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -3,10 +3,10 @@ from pathlib import Path import os.path import xml.etree.ElementTree as ET -from .utils.xml import create_page_xml, xml_reading_order -from .utils.counter import EynollahIdCounter +import numpy as np +from shapely import affinity, clip_by_rect -from ocrd_utils import getLogger +from ocrd_utils import getLogger, points_from_polygon from ocrd_models.ocrd_page import ( BorderType, CoordsType, @@ -19,7 +19,10 @@ from ocrd_models.ocrd_page import ( SeparatorRegionType, to_xml ) -import numpy as np + +from .utils.xml import create_page_xml, xml_reading_order +from .utils.counter import EynollahIdCounter +from .utils.contour import contour2polygon, make_valid class EynollahXmlWriter: @@ -41,20 +44,14 @@ class EynollahXmlWriter: def image_filename_stem(self): return Path(Path(self.image_filename).name).stem - def calculate_page_coords(self, cont_page): - self.logger.debug('enter calculate_page_coords') - points_page_print = "" - for _, contour in enumerate(cont_page[0]): - if len(contour) == 2: - points_page_print += str(int((contour[0]) / self.scale_x)) - points_page_print += ',' - points_page_print += str(int((contour[1]) / self.scale_y)) - else: - 
points_page_print += str(int((contour[0][0]) / self.scale_x)) - points_page_print += ',' - points_page_print += str(int((contour[0][1] ) / self.scale_y)) - points_page_print = points_page_print + ' ' - return points_page_print[:-1] + def calculate_points(self, contour, offset=None): + self.logger.debug('enter calculate_points') + poly = contour2polygon(contour) + if offset is not None: + poly = affinity.translate(poly, *offset) + poly = affinity.scale(poly, xfact=1 / self.scale_x, yfact=1 / self.scale_y, origin=(0, 0)) + poly = make_valid(clip_by_rect(poly, 0, 0, self.width_org, self.height_org)) + return points_from_polygon(poly.exterior.coords[:-1]) def serialize_lines_in_region(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter, ocr_all_textlines_textregion): self.logger.debug('enter serialize_lines_in_region') @@ -67,20 +64,12 @@ class EynollahXmlWriter: text_region.add_TextLine(textline) text_region.set_orientation(-slopes[region_idx]) region_bboxes = all_box_coord[region_idx] - points_co = '' - for point in polygon_textline: - if len(point) != 2: - point = point[0] - point_x = point[0] + page_coord[2] - point_y = point[1] + page_coord[0] - # FIXME: or actually... not self.textline_light and not self.curved_line or np.abs(slopes[region_idx]) > 45? - if not self.textline_light and not (self.curved_line and np.abs(slopes[region_idx]) <= 45): - point_x += region_bboxes[2] - point_y += region_bboxes[0] - point_x = max(0, int(point_x / self.scale_x)) - point_y = max(0, int(point_y / self.scale_y)) - points_co += str(point_x) + ',' + str(point_y) + ' ' - coords.set_points(points_co[:-1]) + offset = [page_coord[2], page_coord[0]] + # FIXME: or actually... not self.textline_light and not self.curved_line or np.abs(slopes[region_idx]) > 45? 
+ if not self.textline_light and not (self.curved_line and np.abs(slopes[region_idx]) <= 45): + offset[0] += region_bboxes[2] + offset[1] += region_bboxes[0] + coords.set_points(self.calculate_points(polygon_textline, offset)) def write_pagexml(self, pcgts): self.logger.info("output filename: '%s'", self.output_filename) @@ -135,8 +124,13 @@ class EynollahXmlWriter: # create the file structure pcgts = self.pcgts if self.pcgts else create_page_xml(self.image_filename, self.height_org, self.width_org) page = pcgts.get_Page() - page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_page_coords(cont_page)))) + if len(cont_page): + page.set_Border(BorderType(Coords=CoordsType(points=self.calculate_points(cont_page[0])))) + if skip_layout_reading_order: + offset = None + else: + offset = [page_coord[2], page_coord[0]] counter = EynollahIdCounter() if len(order_of_texts): _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts)) @@ -149,8 +143,7 @@ class EynollahXmlWriter: for mm, region_contour in enumerate(found_polygons_text_region): textregion = TextRegionType( id=counter.next_region_id, type_='paragraph', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord, - skip_layout_reading_order)) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)) ) if conf_contours_textregions: textregion.Coords.set_conf(conf_contours_textregions[mm]) @@ -166,7 +159,7 @@ class EynollahXmlWriter: for mm, region_contour in enumerate(found_polygons_text_region_h): textregion = TextRegionType( id=counter.next_region_id, type_='heading', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)) ) if conf_contours_textregions_h: textregion.Coords.set_conf(conf_contours_textregions_h[mm]) @@ -181,7 +174,7 @@ class EynollahXmlWriter: for mm, region_contour in enumerate(found_polygons_marginals_left): marginal = 
TextRegionType( id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)) ) page.add_TextRegion(marginal) if ocr_all_textlines_marginals_left: @@ -193,7 +186,7 @@ class EynollahXmlWriter: for mm, region_contour in enumerate(found_polygons_marginals_right): marginal = TextRegionType( id=counter.next_region_id, type_='marginalia', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)) ) page.add_TextRegion(marginal) if ocr_all_textlines_marginals_right: @@ -206,7 +199,7 @@ class EynollahXmlWriter: for mm, region_contour in enumerate(found_polygons_drop_capitals): dropcapital = TextRegionType( id=counter.next_region_id, type_='drop-capital', - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)) ) page.add_TextRegion(dropcapital) all_box_coord_drop = [[0, 0, 0, 0]] @@ -221,33 +214,17 @@ class EynollahXmlWriter: for region_contour in found_polygons_text_region_img: page.add_ImageRegion( ImageRegionType(id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)))) + Coords=CoordsType(points=self.calculate_points(region_contour, offset)))) for region_contour in polygons_seplines: page.add_SeparatorRegion( SeparatorRegionType(id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, [0, 0, 0, 0])))) + Coords=CoordsType(points=self.calculate_points(region_contour, None)))) for region_contour in found_polygons_tables: page.add_TableRegion( TableRegionType(id=counter.next_region_id, - Coords=CoordsType(points=self.calculate_polygon_coords(region_contour, page_coord)))) + 
Coords=CoordsType(points=self.calculate_points(region_contour, offset)))) return pcgts - def calculate_polygon_coords(self, contour, page_coord, skip_layout_reading_order=False): - self.logger.debug('enter calculate_polygon_coords') - coords = '' - for point in contour: - if len(point) != 2: - point = point[0] - point_x = point[0] - point_y = point[1] - if not skip_layout_reading_order: - point_x += page_coord[2] - point_y += page_coord[0] - point_x = int(point_x / self.scale_x) - point_y = int(point_y / self.scale_y) - coords += str(point_x) + ',' + str(point_y) + ' ' - return coords[:-1] - From ad8f8167c2d5bdc5c59d50a6a6eaf920b5e72c51 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Dec 2025 00:58:26 +0100 Subject: [PATCH 046/118] separate_lines/_vertical: gen cv2-like contours (w/ ndim=3, as in all other places) --- src/eynollah/utils/separate_lines.py | 128 +++++++++++++-------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 7e415b5..830dd8d 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -403,14 +403,14 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): point_down_rot3=point_down_rot3-y_help point_down_rot4=point_down_rot4-y_help - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + 
[[int(x_max), int(point_up)]], + [[int(x_max), int(point_down)]], + [[int(x_min), int(point_down)]]])) elif len(peaks) < 1: pass @@ -462,14 +462,14 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): point_down_rot3=point_down_rot3-y_help point_down_rot4=point_down_rot4-y_help - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(y_min)], - [int(x_max), int(y_min)], - [int(x_max), int(y_max)], - [int(x_min), int(y_max)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(y_min)]], + [[int(x_max), int(y_min)]], + [[int(x_max), int(y_max)]], + [[int(x_min), int(y_max)]]])) elif len(peaks) == 2: dis_to_next = np.abs(peaks[1] - peaks[0]) for jj in range(len(peaks)): @@ -530,14 +530,14 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): point_down_rot3=point_down_rot3-y_help point_down_rot4=point_down_rot4-y_help - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + [[int(x_max), int(point_up)]], + [[int(x_max), 
int(point_down)]], + [[int(x_min), int(point_down)]]])) else: for jj in range(len(peaks)): if jj == 0: @@ -606,14 +606,14 @@ def separate_lines(img_patch, contour_text_interest, thetha, x_help, y_help): point_down_rot3=point_down_rot3-y_help point_down_rot4=point_down_rot4-y_help - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + [[int(x_max), int(point_up)]], + [[int(x_max), int(point_down)]], + [[int(x_min), int(point_down)]]])) return peaks, textline_boxes_rot def separate_lines_vertical(img_patch, contour_text_interest, thetha): @@ -785,14 +785,14 @@ def separate_lines_vertical(img_patch, contour_text_interest, thetha): if point_up_rot2 < 0: point_up_rot2 = 0 - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + [[int(x_max), int(point_up)]], + [[int(x_max), int(point_down)]], + 
[[int(x_min), int(point_down)]]])) elif len(peaks) < 1: pass elif len(peaks) == 1: @@ -821,14 +821,14 @@ def separate_lines_vertical(img_patch, contour_text_interest, thetha): if point_up_rot2 < 0: point_up_rot2 = 0 - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(y_min)], - [int(x_max), int(y_min)], - [int(x_max), int(y_max)], - [int(x_min), int(y_max)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(y_min)]], + [[int(x_max), int(y_min)]], + [[int(x_max), int(y_max)]], + [[int(x_min), int(y_max)]]])) elif len(peaks) == 2: dis_to_next = np.abs(peaks[1] - peaks[0]) for jj in range(len(peaks)): @@ -876,14 +876,14 @@ def separate_lines_vertical(img_patch, contour_text_interest, thetha): if point_up_rot2 < 0: point_up_rot2 = 0 - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + [[int(x_max), int(point_up)]], + [[int(x_max), int(point_down)]], + [[int(x_min), int(point_down)]]])) else: for jj in range(len(peaks)): if jj == 0: @@ -942,14 +942,14 @@ def 
separate_lines_vertical(img_patch, contour_text_interest, thetha): if point_up_rot2 < 0: point_up_rot2 = 0 - textline_boxes_rot.append(np.array([[int(x_min_rot1), int(point_up_rot1)], - [int(x_max_rot2), int(point_up_rot2)], - [int(x_max_rot3), int(point_down_rot3)], - [int(x_min_rot4), int(point_down_rot4)]])) - textline_boxes.append(np.array([[int(x_min), int(point_up)], - [int(x_max), int(point_up)], - [int(x_max), int(point_down)], - [int(x_min), int(point_down)]])) + textline_boxes_rot.append(np.array([[[int(x_min_rot1), int(point_up_rot1)]], + [[int(x_max_rot2), int(point_up_rot2)]], + [[int(x_max_rot3), int(point_down_rot3)]], + [[int(x_min_rot4), int(point_down_rot4)]]])) + textline_boxes.append(np.array([[[int(x_min), int(point_up)]], + [[int(x_max), int(point_up)]], + [[int(x_max), int(point_down)]], + [[int(x_min), int(point_down)]]])) return peaks, textline_boxes_rot def separate_lines_new_inside_tiles2(img_patch, thetha): From 9fdae72e9620bd0ebd3bcef6bd8189fe8a003734 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 3 Dec 2025 03:04:46 +0100 Subject: [PATCH 047/118] utils_ocr.return_textline_contour: gen cv2-like contours (w/ ndim=3, as in all other places) --- src/eynollah/utils/utils_ocr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/utils/utils_ocr.py b/src/eynollah/utils/utils_ocr.py index 6e71b0f..fbe3611 100644 --- a/src/eynollah/utils/utils_ocr.py +++ b/src/eynollah/utils/utils_ocr.py @@ -369,8 +369,8 @@ def break_curved_line_into_small_pieces_and_then_merge(img_curved, mask_curved, return img_curved, img_bin_curved def return_textline_contour_with_added_box_coordinate(textline_contour, box_ind): - textline_contour[:,0] = textline_contour[:,0] + box_ind[2] - textline_contour[:,1] = textline_contour[:,1] + box_ind[0] + textline_contour[:,:,0] += box_ind[2] + textline_contour[:,:,1] += box_ind[0] return textline_contour From e2754da4f5f81ce34d5a21bf726741c27ac2aecf Mon Sep 17 00:00:00 2001 From: 
Robert Sachunsky Date: Tue, 20 Jan 2026 04:04:07 +0100 Subject: [PATCH 048/118] =?UTF-8?q?adapt=20to=20Numpy=201.25=20changes?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (esp. `np.array(...)` now not allowed on ragged arrays unless `dtype=object`, but then coercing sub-arrays to `object` as well) --- src/eynollah/eynollah.py | 22 +++++++++++++--------- src/eynollah/utils/__init__.py | 10 +++++++++- src/eynollah/utils/contour.py | 13 ++++++++----- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index cceab31..c33b9f8 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -117,6 +117,7 @@ from .utils.marginals import get_marginals from .utils.resize import resize_image from .utils.shm import share_ndarray from .utils import ( + ensure_array, is_image_filename, boosting_headers_by_longshot_region_segmentation, crop_image_inside_box, @@ -2475,8 +2476,8 @@ class Eynollah: self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): self.logger.debug("enter do_order_of_regions") - contours_only_text_parent = np.array(contours_only_text_parent) - contours_only_text_parent_h = np.array(contours_only_text_parent_h) + contours_only_text_parent = ensure_array(contours_only_text_parent) + contours_only_text_parent_h = ensure_array(contours_only_text_parent_h) boxes = np.array(boxes, dtype=int) # to be on the safe side c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1), 0.5 * boxes[:, 0:2].sum(axis=1))) @@ -3987,7 +3988,7 @@ class Eynollah: def filterfun(lis): if len(lis) == 0: return [] - return list(np.array(lis)[indices]) + return list(ensure_array(lis)[indices]) return (filterfun(contours_par), filterfun(contours_textline), @@ -4378,7 +4379,8 @@ class Eynollah: areas_cnt_text = np.array([cv2.contourArea(c) for c in contours_only_text_parent]) areas_cnt_text = areas_cnt_text / 
float(areas_tot_text) #self.logger.info('areas_cnt_text %s', areas_cnt_text) - contours_only_text_parent = np.array(contours_only_text_parent)[areas_cnt_text > MIN_AREA_REGION] + contours_only_text_parent = ensure_array(contours_only_text_parent) + contours_only_text_parent = contours_only_text_parent[areas_cnt_text > MIN_AREA_REGION] areas_cnt_text_parent = areas_cnt_text[areas_cnt_text > MIN_AREA_REGION] index_con_parents = np.argsort(areas_cnt_text_parent) @@ -4397,12 +4399,13 @@ class Eynollah: areas_cnt_text_d = np.array([cv2.contourArea(c) for c in contours_only_text_parent_d]) areas_cnt_text_d = areas_cnt_text_d / float(areas_tot_text_d) - contours_only_text_parent_d = np.array(contours_only_text_parent_d)[areas_cnt_text_d > MIN_AREA_REGION] + contours_only_text_parent_d = ensure_array(contours_only_text_parent_d) + contours_only_text_parent_d = contours_only_text_parent_d[areas_cnt_text_d > MIN_AREA_REGION] areas_cnt_text_d = areas_cnt_text_d[areas_cnt_text_d > MIN_AREA_REGION] if len(contours_only_text_parent_d): index_con_parents_d = np.argsort(areas_cnt_text_d) - contours_only_text_parent_d = np.array(contours_only_text_parent_d)[index_con_parents_d] + contours_only_text_parent_d = contours_only_text_parent_d[index_con_parents_d] areas_cnt_text_d = areas_cnt_text_d[index_con_parents_d] centers_d = np.stack(find_center_of_contours(contours_only_text_parent_d)) # [2, N] @@ -4546,9 +4549,10 @@ class Eynollah: #print("text region early 3 in %.1fs", time.time() - t0) if self.light_version: contours_only_text_parent = dilate_textregion_contours(contours_only_text_parent) - contours_only_text_parent , contours_only_text_parent_d_ordered = self.filter_contours_inside_a_bigger_one( - contours_only_text_parent, contours_only_text_parent_d_ordered, text_only, - marginal_cnts=polygons_of_marginals) + contours_only_text_parent, contours_only_text_parent_d_ordered = \ + self.filter_contours_inside_a_bigger_one( + contours_only_text_parent, 
contours_only_text_parent_d_ordered, text_only, + marginal_cnts=polygons_of_marginals) #print("text region early 3.5 in %.1fs", time.time() - t0) conf_contours_textregions = get_textregion_contours_in_org_image_light( contours_only_text_parent, self.image, confidence_matrix) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 43d5d75..4e55aef 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing import Iterable, List, Tuple from logging import getLogger import time import math @@ -1929,3 +1929,11 @@ def is_image_filename(fname: str) -> bool: def is_xml_filename(fname: str) -> bool: return fname.lower().endswith('.xml') + +def ensure_array(obj: Iterable) -> np.ndarray: + """convert sequence to array of type `object` so items can be of heterogeneous shape + (but ensure not to convert inner arrays to `object` if len=1) + """ + if not isinstance(obj, np.ndarray): + return np.fromiter(obj, object) + return obj diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 393acdd..7d01e74 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -12,6 +12,7 @@ from shapely import set_precision from shapely.ops import unary_union, nearest_points from .rotate import rotate_image, rotation_image_new +from . 
import ensure_array def contours_in_same_horizon(cy_main_hor): """ @@ -248,13 +249,15 @@ def return_contours_of_image(image): return contours, hierarchy def dilate_textline_contours(all_found_textline_polygons): - return [[polygon2contour(contour2polygon(contour, dilate=6)) - for contour in region] + return [ensure_array( + [polygon2contour(contour2polygon(contour, dilate=6)) + for contour in region]) for region in all_found_textline_polygons] -def dilate_textregion_contours(all_found_textline_polygons): - return [polygon2contour(contour2polygon(contour, dilate=6)) - for contour in all_found_textline_polygons] +def dilate_textregion_contours(all_found_textregion_polygons): + return ensure_array( + [polygon2contour(contour2polygon(contour, dilate=6)) + for contour in all_found_textregion_polygons]) def contour2polygon(contour: Union[np.ndarray, Sequence[Sequence[Sequence[Number]]]], dilate=0): polygon = Polygon([point[0] for point in contour]) From 3c3effcfda9b8d4dfd9dc8f685bb520fab1840b3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Jan 2026 04:18:55 +0100 Subject: [PATCH 049/118] =?UTF-8?q?drop=20TF1=20vernacular,=20relax=20TF/K?= =?UTF-8?q?eras=20and=20Torch=20requirements=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - do not restrict TF version, but depend on tf-keras and set `TF_USE_LEGACY_KERAS=1` to avoid Keras 3 behaviour - relax Numpy version requirement up to v2 - relax Torch version requirement - drop TF1 session management code - drop TF1 config in favour of TF2 config code for memory growth - training.*: also simplify and limit line length - training.train: always train with TensorBoard callback --- requirements-ocr.txt | 2 +- requirements.txt | 5 +- src/eynollah/eynollah.py | 12 +- src/eynollah/sbb_binarize.py | 28 +- ..._model_load_pretrained_weights_and_save.py | 8 +- src/eynollah/training/inference.py | 192 ++++------ src/eynollah/training/train.py | 333 +++++++++++------- 
src/eynollah/utils/contour.py | 3 +- 8 files changed, 289 insertions(+), 294 deletions(-) diff --git a/requirements-ocr.txt b/requirements-ocr.txt index 9f31ebb..8f3b062 100644 --- a/requirements-ocr.txt +++ b/requirements-ocr.txt @@ -1,2 +1,2 @@ -torch <= 2.0.1 +torch transformers <= 4.30.2 diff --git a/requirements.txt b/requirements.txt index db1d7df..5699566 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ # ocrd includes opencv, numpy, shapely, click ocrd >= 3.3.0 -numpy <1.24.0 +numpy < 2.0 scikit-learn >= 0.23.2 -tensorflow < 2.13 +tensorflow +tf-keras # avoid keras 3 (also needs TF_USE_LEGACY_KERAS=1) numba <= 0.58.1 scikit-image biopython diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index c33b9f8..4a83c0a 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -56,14 +56,12 @@ except ImportError: TrOCRProcessor = VisionEncoderDecoderModel = None #os.environ['CUDA_VISIBLE_DEVICES'] = '-1' +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 tf_disable_interactive_logs() import tensorflow as tf -from tensorflow.python.keras import backend as K from tensorflow.keras.models import load_model tf.get_logger().setLevel("ERROR") warnings.filterwarnings("ignore") -# use tf1 compatibility for keras backend -from tensorflow.compat.v1.keras.backend import set_session from tensorflow.keras import layers from tensorflow.keras.layers import StringLookup @@ -277,14 +275,6 @@ class Eynollah: t_start = time.time() - # #gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) - # #gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=7.7, allow_growth=True) - # #session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) - # config = tf.compat.v1.ConfigProto() - # config.gpu_options.allow_growth = True - # #session = tf.InteractiveSession() - # session = tf.compat.v1.Session(config=config) - # set_session(session) try: for device in 
tf.config.list_physical_devices('GPU'): tf.config.experimental.set_memory_growth(device, True) diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index b81f45e..2ca4a40 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -2,19 +2,19 @@ Tool to load model and binarize a given image. """ -import sys from glob import glob import os import logging +from PIL import Image import numpy as np -from PIL import Image import cv2 from ocrd_utils import tf_disable_interactive_logs + +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 tf_disable_interactive_logs() import tensorflow as tf from tensorflow.keras.models import load_model -from tensorflow.python.keras import backend as tensorflow_backend from .utils import is_image_filename @@ -27,26 +27,17 @@ class SbbBinarizer: self.model_dir = model_dir self.logger = logger if logger else logging.getLogger('SbbBinarizer') - self.start_new_session() - - self.model_files = glob(self.model_dir+"/*/", recursive = True) + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + self.logger.warning("no GPU device available") + self.model_files = glob(self.model_dir + "/*/", recursive=True) self.models = [] for model_file in self.model_files: self.models.append(self.load_model(model_file)) - def start_new_session(self): - config = tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True - - self.session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() - tensorflow_backend.set_session(self.session) - - def end_session(self): - tensorflow_backend.clear_session() - self.session.close() - del self.session - def load_model(self, model_name): model = load_model(os.path.join(self.model_dir, model_name), compile=False) model_height = model.layers[len(model.layers)-1].output_shape[1] @@ -55,7 +46,6 @@ class SbbBinarizer: return model, model_height, model_width, n_classes def 
predict(self, model_in, img, use_patches, n_batch_inference=5): - tensorflow_backend.set_session(self.session) model, model_height, model_width, n_classes = model_in img_org_h = img.shape[0] diff --git a/src/eynollah/training/build_model_load_pretrained_weights_and_save.py b/src/eynollah/training/build_model_load_pretrained_weights_and_save.py index 40fc1fe..9fba66b 100644 --- a/src/eynollah/training/build_model_load_pretrained_weights_and_save.py +++ b/src/eynollah/training/build_model_load_pretrained_weights_and_save.py @@ -1,3 +1,4 @@ +import sys import click import tensorflow as tf @@ -5,8 +6,11 @@ from .models import resnet50_unet def configuration(): - gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) - session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)) + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + print("no GPU device available", file=sys.stderr) @click.command() def build_model_load_pretrained_weights_and_save(): diff --git a/src/eynollah/training/inference.py b/src/eynollah/training/inference.py index 3fa8fd6..15d1e6a 100644 --- a/src/eynollah/training/inference.py +++ b/src/eynollah/training/inference.py @@ -1,16 +1,19 @@ +""" +Tool to load model and predict for given image. 
+""" + import sys import os import warnings import json +import click import numpy as np import cv2 -from tensorflow.keras.models import load_model + +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf -from tensorflow.keras import backend as K -from tensorflow.keras.layers import * -import click -from tensorflow.python.keras import backend as tensorflow_backend +from tensorflow.keras.models import load_model import xml.etree.ElementTree as ET from .gt_gen_utils import ( @@ -24,17 +27,29 @@ from .models import ( PatchEncoder, Patches ) +from .metrics import ( + soft_dice_loss, + weighted_categorical_crossentropy, +) with warnings.catch_warnings(): warnings.simplefilter("ignore") -__doc__=\ -""" -Tool to load model and predict for given image. -""" +class SBBPredict: + def __init__(self, + image, + dir_in, + model, + task, + config_params_model, + patches, + save, + save_layout, + ground_truth, + xml_file, + out, + min_area): -class sbb_predict: - def __init__(self,image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, out, min_area): self.image=image self.dir_in=dir_in self.patches=patches @@ -52,8 +67,9 @@ class sbb_predict: self.min_area = 0 def resize_image(self,img_in,input_height,input_width): - return cv2.resize( img_in, ( input_width,input_height) ,interpolation=cv2.INTER_NEAREST) - + return cv2.resize(img_in, (input_width, + input_height), + interpolation=cv2.INTER_NEAREST) def color_images(self,seg): ann_u=range(self.n_classes) @@ -69,68 +85,6 @@ class sbb_predict: seg_img[:,:,2][seg==c]=c return seg_img - def otsu_copy_binary(self,img): - img_r=np.zeros((img.shape[0],img.shape[1],3)) - img1=img[:,:,0] - - #print(img.min()) - #print(img[:,:,0].min()) - #blur = cv2.GaussianBlur(img,(5,5)) - #ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - retval1, threshold1 = cv2.threshold(img1, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) - - - - 
img_r[:,:,0]=threshold1 - img_r[:,:,1]=threshold1 - img_r[:,:,2]=threshold1 - #img_r=img_r/float(np.max(img_r))*255 - return img_r - - def otsu_copy(self,img): - img_r=np.zeros((img.shape[0],img.shape[1],3)) - #img1=img[:,:,0] - - #print(img.min()) - #print(img[:,:,0].min()) - #blur = cv2.GaussianBlur(img,(5,5)) - #ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - _, threshold1 = cv2.threshold(img[:,:,0], 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) - _, threshold2 = cv2.threshold(img[:,:,1], 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) - _, threshold3 = cv2.threshold(img[:,:,2], 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) - - - - img_r[:,:,0]=threshold1 - img_r[:,:,1]=threshold2 - img_r[:,:,2]=threshold3 - ###img_r=img_r/float(np.max(img_r))*255 - return img_r - - def soft_dice_loss(self,y_true, y_pred, epsilon=1e-6): - - axes = tuple(range(1, len(y_pred.shape)-1)) - - numerator = 2. * K.sum(y_pred * y_true, axes) - - denominator = K.sum(K.square(y_pred) + K.square(y_true), axes) - return 1.00 - K.mean(numerator / (denominator + epsilon)) # average over classes and batch - - def weighted_categorical_crossentropy(self,weights=None): - - def loss(y_true, y_pred): - labels_floats = tf.cast(y_true, tf.float32) - per_pixel_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_floats,logits=y_pred) - - if weights is not None: - weight_mask = tf.maximum(tf.reduce_max(tf.constant( - np.array(weights, dtype=np.float32)[None, None, None]) - * labels_floats, axis=-1), 1.0) - per_pixel_loss = per_pixel_loss * weight_mask[:, :, :, None] - return tf.reduce_mean(per_pixel_loss) - return self.loss - - def IoU(self,Yi,y_predi): ## mean Intersection over Union ## Mean IoU = TP/(FN + TP + FP) @@ -157,30 +111,28 @@ class sbb_predict: return mIoU def start_new_session_and_model(self): - - config = tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True + try: + for device in tf.config.list_physical_devices('GPU'): + 
tf.config.experimental.set_memory_growth(device, True) + except: + print("no GPU device available", file=sys.stderr) - session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() - tensorflow_backend.set_session(session) #tensorflow.keras.layers.custom_layer = PatchEncoder #tensorflow.keras.layers.custom_layer = Patches - self.model = load_model(self.model_dir , compile=False,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) - #config = tf.ConfigProto() - #config.gpu_options.allow_growth=True - - #self.session = tf.InteractiveSession() - #keras.losses.custom_loss = self.weighted_categorical_crossentropy - #self.model = load_model(self.model_dir , compile=False) + self.model = load_model(self.model_dir, compile=False, + custom_objects={"PatchEncoder": PatchEncoder, + "Patches": Patches}) + #keras.losses.custom_loss = weighted_categorical_crossentropy + #self.model = load_model(self.model_dir, compile=False) - ##if self.weights_dir!=None: ##self.model.load_weights(self.weights_dir) if self.task != 'classification' and self.task != 'reading_order': - self.img_height=self.model.layers[len(self.model.layers)-1].output_shape[1] - self.img_width=self.model.layers[len(self.model.layers)-1].output_shape[2] - self.n_classes=self.model.layers[len(self.model.layers)-1].output_shape[3] + last = self.model.layers[-1] + self.img_height = last.output_shape[1] + self.img_width = last.output_shape[2] + self.n_classes = last.output_shape[3] def visualize_model_output(self, prediction, img, task): if task == "binarization": @@ -208,21 +160,16 @@ class sbb_predict: '15' : [255, 0, 255]} layout_only = np.zeros(prediction.shape) - for unq_class in unique_classes: + where = prediction[:,:,0]==unq_class rgb_class_unique = rgb_colors[str(int(unq_class))] - layout_only[:,:,0][prediction[:,:,0]==unq_class] = rgb_class_unique[0] - layout_only[:,:,1][prediction[:,:,0]==unq_class] = rgb_class_unique[1] - layout_only[:,:,2][prediction[:,:,0]==unq_class] = 
rgb_class_unique[2] - - + layout_only[:,:,0][where] = rgb_class_unique[0] + layout_only[:,:,1][where] = rgb_class_unique[1] + layout_only[:,:,2][where] = rgb_class_unique[2] + layout_only = layout_only.astype(np.int32) img = self.resize_image(img, layout_only.shape[0], layout_only.shape[1]) - - layout_only = layout_only.astype(np.int32) img = img.astype(np.int32) - - added_image = cv2.addWeighted(img,0.5,layout_only,0.1,0) @@ -231,10 +178,10 @@ class sbb_predict: def predict(self, image_dir): if self.task == 'classification': classes_names = self.config_params_model['classification_classes_name'] - img_1ch = img=cv2.imread(image_dir, 0) - - img_1ch = img_1ch / 255.0 - img_1ch = cv2.resize(img_1ch, (self.config_params_model['input_height'], self.config_params_model['input_width']), interpolation=cv2.INTER_NEAREST) + img_1ch = cv2.imread(image_dir, 0) / 255.0 + img_1ch = cv2.resize(img_1ch, (self.config_params_model['input_height'], + self.config_params_model['input_width']), + interpolation=cv2.INTER_NEAREST) img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) img_in[0, :, :, 0] = img_1ch[:, :] img_in[0, :, :, 1] = img_1ch[:, :] @@ -244,23 +191,27 @@ class sbb_predict: index_class = np.argmax(label_p_pred[0]) print("Predicted Class: {}".format(classes_names[str(int(index_class))])) + elif self.task == 'reading_order': img_height = self.config_params_model['input_height'] img_width = self.config_params_model['input_width'] - tree_xml, root_xml, bb_coord_printspace, file_name, id_paragraph, id_header, co_text_paragraph, co_text_header, tot_region_ref, x_len, y_len, index_tot_regions, img_poly = read_xml(self.xml_file) - _, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, _ = find_new_features_of_contours(co_text_header) + tree_xml, root_xml, bb_coord_printspace, file_name, \ + id_paragraph, id_header, \ + co_text_paragraph, co_text_header, \ + tot_region_ref, x_len, y_len, index_tot_regions, \ + img_poly = read_xml(self.xml_file) + _, cy_main, 
x_min_main, x_max_main, y_min_main, y_max_main, _ = \ + find_new_features_of_contours(co_text_header) img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') - - for j in range(len(cy_main)): - img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12,int(x_min_main[j]):int(x_max_main[j]) ] = 1 - + img_header_and_sep[int(y_max_main[j]): int(y_max_main[j]) + 12, + int(x_min_main[j]): int(x_max_main[j])] = 1 + co_text_all = co_text_paragraph + co_text_header id_all_text = id_paragraph + id_header - ##texts_corr_order_index = [index_tot_regions[tot_region_ref.index(i)] for i in id_all_text ] ##texts_corr_order_index_int = [int(x) for x in texts_corr_order_index] texts_corr_order_index_int = list(np.array(range(len(co_text_all)))) @@ -271,7 +222,8 @@ class sbb_predict: #print(np.shape(co_text_all[0]), len( np.shape(co_text_all[0]) ),'co_text_all') #co_text_all = filter_contours_area_of_image_tables(img_poly, co_text_all, _, max_area, min_area) #print(co_text_all,'co_text_all') - co_text_all, texts_corr_order_index_int, _ = filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, max_area, self.min_area) + co_text_all, texts_corr_order_index_int, _ = filter_contours_area_of_image( + img_poly, co_text_all, texts_corr_order_index_int, max_area, self.min_area) #print(texts_corr_order_index_int) @@ -664,17 +616,15 @@ class sbb_predict: help="min area size of regions considered for reading order detection. 
The default value is zero and means that all text regions are considered for reading order.", ) def main(image, dir_in, model, patches, save, save_layout, ground_truth, xml_file, out, min_area): - assert image or dir_in, "Either a single image -i or a dir_in -di is required" + assert image or dir_in, "Either a single image -i or a dir_in -di input is required" with open(os.path.join(model,'config.json')) as f: config_params_model = json.load(f) task = config_params_model['task'] if task != 'classification' and task != 'reading_order': - if image and not save: - print("Error: You used one of segmentation or binarization task with image input but not set -s, you need a filename to save visualized output with -s") - sys.exit(1) - if dir_in and not out: - print("Error: You used one of segmentation or binarization task with dir_in but not set -out") - sys.exit(1) - x=sbb_predict(image, dir_in, model, task, config_params_model, patches, save, save_layout, ground_truth, xml_file, out, min_area) + assert not image or save, "For segmentation or binarization, an input single image -i also requires an output filename -s" + assert not dir_in or out, "For segmentation or binarization, an input directory -di also requires an output directory -o" + x = SBBPredict(image, dir_in, model, task, config_params_model, + patches, save, save_layout, ground_truth, xml_file, out, + min_area) x.run() diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 97736e0..da901b0 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -28,14 +28,14 @@ from eynollah.training.utils import ( ) os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf -from tensorflow.compat.v1.keras.backend import set_session from tensorflow.keras.optimizers import SGD, Adam -from sacred import Experiment from tensorflow.keras.models import load_model +from tensorflow.keras.callbacks 
import Callback, TensorBoard +from sacred import Experiment from tqdm import tqdm from sklearn.metrics import f1_score -from tensorflow.keras.callbacks import Callback import numpy as np import cv2 @@ -63,10 +63,11 @@ class SaveWeightsAfterSteps(Callback): def configuration(): - config = tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True - session = tf.compat.v1.Session(config=config) - set_session(session) + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.experimental.set_memory_growth(device, True) + except: + print("no GPU device available", file=sys.stderr) def get_dirs_or_files(input_data): @@ -171,12 +172,11 @@ def run(_config, n_classes, n_epochs, input_height, else: list_all_possible_foreground_rgbs = None - if task == "segmentation" or task == "enhancement" or task == "binarization": + if task in ["segmentation", "enhancement", "binarization"]: if data_is_provided: dir_train_flowing = os.path.join(dir_output, 'train') dir_eval_flowing = os.path.join(dir_output, 'eval') - dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images') dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels') @@ -227,176 +227,228 @@ def run(_config, n_classes, n_epochs, input_height, segs_list_test=np.array(os.listdir(dir_seg_val)) # writing patches into a sub-folder in order to be flowed from directory. 
- provide_patches(imgs_list, segs_list, dir_img, dir_seg, dir_flow_train_imgs, - dir_flow_train_labels, input_height, input_width, blur_k, - blur_aug, padding_white, padding_black, flip_aug, binarization, adding_rgb_background,adding_rgb_foreground, add_red_textlines, channels_shuffling, - scaling, shifting, degrading, brightening, scales, degrade_scales, brightness, - flip_index,shuffle_indexes, scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=augmentation, - patches=patches, dir_img_bin=dir_img_bin,number_of_backgrounds_per_image=number_of_backgrounds_per_image,list_all_possible_background_images=list_all_possible_background_images, dir_rgb_backgrounds=dir_rgb_backgrounds, dir_rgb_foregrounds=dir_rgb_foregrounds,list_all_possible_foreground_rgbs=list_all_possible_foreground_rgbs) - - provide_patches(imgs_list_test, segs_list_test, dir_img_val, dir_seg_val, - dir_flow_eval_imgs, dir_flow_eval_labels, input_height, input_width, - blur_k, blur_aug, padding_white, padding_black, flip_aug, binarization, adding_rgb_background, adding_rgb_foreground, add_red_textlines, channels_shuffling, - scaling, shifting, degrading, brightening, scales, degrade_scales, brightness, - flip_index, shuffle_indexes, scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=patches,dir_img_bin=dir_img_bin,number_of_backgrounds_per_image=number_of_backgrounds_per_image,list_all_possible_background_images=list_all_possible_background_images, dir_rgb_backgrounds=dir_rgb_backgrounds,dir_rgb_foregrounds=dir_rgb_foregrounds,list_all_possible_foreground_rgbs=list_all_possible_foreground_rgbs ) + common_args = [input_height, input_width, + blur_k, blur_aug, + padding_white, padding_black, + flip_aug, binarization, + adding_rgb_background, + adding_rgb_foreground, + add_red_textlines, + channels_shuffling, + scaling, shifting, 
degrading, brightening, + scales, degrade_scales, brightness, + flip_index, shuffle_indexes, + scaling_bluring, scaling_brightness, scaling_binarization, + rotation, rotation_not_90, thetha, + scaling_flip, task, + ] + common_kwargs = dict(patches= + patches, + dir_img_bin= + dir_img_bin, + number_of_backgrounds_per_image= + number_of_backgrounds_per_image, + list_all_possible_background_images= + list_all_possible_background_images, + dir_rgb_backgrounds= + dir_rgb_backgrounds, + dir_rgb_foregrounds= + dir_rgb_foregrounds, + list_all_possible_foreground_rgbs= + list_all_possible_foreground_rgbs, + ) + provide_patches(imgs_list, segs_list, + dir_img, dir_seg, + dir_flow_train_imgs, + dir_flow_train_labels, + *common_args, + augmentation=augmentation, + **common_kwargs) + provide_patches(imgs_list_test, segs_list_test, + dir_img_val, dir_seg_val, + dir_flow_eval_imgs, + dir_flow_eval_labels, + *common_args, + augmentation=False, + **common_kwargs) if weighted_loss: weights = np.zeros(n_classes) if data_is_provided: - for obj in os.listdir(dir_flow_train_labels): - try: - label_obj = cv2.imread(dir_flow_train_labels + '/' + obj) - label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) - weights += (label_obj_one_hot.sum(axis=0)).sum(axis=0) - except: - pass + dirs = dir_flow_train_labels else: - - for obj in os.listdir(dir_seg): - try: - label_obj = cv2.imread(dir_seg + '/' + obj) - label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) - weights += (label_obj_one_hot.sum(axis=0)).sum(axis=0) - except: - pass + dirs = dir_seg + for obj in os.listdir(dirs): + label_file = os.path.join(dirs, + obj) + try: + label_obj = cv2.imread(label_file) + label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) + weights += (label_obj_one_hot.sum(axis=0)).sum(axis=0) + except Exception as e: + print("error reading data file '%s': %s" % (label_file, e), 
file=sys.stderr) weights = 1.00 / weights - weights = weights / float(np.sum(weights)) weights = weights / float(np.min(weights)) weights = weights / float(np.sum(weights)) if continue_training: - if backbone_type=='nontransformer': - if is_loss_soft_dice and (task == "segmentation" or task == "binarization"): - model = load_model(dir_of_start_model, compile=True, custom_objects={'soft_dice_loss': soft_dice_loss}) - if weighted_loss and (task == "segmentation" or task == "binarization"): - model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) - if not is_loss_soft_dice and not weighted_loss: + if backbone_type == 'nontransformer': + if is_loss_soft_dice and task in ["segmentation", "binarization"]: + model = load_model(dir_of_start_model, compile=True, + custom_objects={'soft_dice_loss': soft_dice_loss}) + elif weighted_loss and task in ["segmentation", "binarization"]: + model = load_model(dir_of_start_model, compile=True, + custom_objects={'loss': weighted_categorical_crossentropy(weights)}) + else: model = load_model(dir_of_start_model , compile=True) - elif backbone_type=='transformer': - if is_loss_soft_dice and (task == "segmentation" or task == "binarization"): - model = load_model(dir_of_start_model, compile=True, custom_objects={"PatchEncoder": PatchEncoder, "Patches": Patches,'soft_dice_loss': soft_dice_loss}) - if weighted_loss and (task == "segmentation" or task == "binarization"): - model = load_model(dir_of_start_model, compile=True, custom_objects={'loss': weighted_categorical_crossentropy(weights)}) - if not is_loss_soft_dice and not weighted_loss: - model = load_model(dir_of_start_model , compile=True,custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) + + elif backbone_type == 'transformer': + if is_loss_soft_dice and task in ["segmentation", "binarization"]: + model = load_model(dir_of_start_model, compile=True, + custom_objects={"PatchEncoder": PatchEncoder, + 
"Patches": Patches, + 'soft_dice_loss': soft_dice_loss}) + elif weighted_loss and task in ["segmentation", "binarization"]: + model = load_model(dir_of_start_model, compile=True, + custom_objects={'loss': weighted_categorical_crossentropy(weights)}) + else: + model = load_model(dir_of_start_model, compile=True, + custom_objects = {"PatchEncoder": PatchEncoder, + "Patches": Patches}) else: index_start = 0 - if backbone_type=='nontransformer': - model = resnet50_unet(n_classes, input_height, input_width, task, weight_decay, pretraining) - elif backbone_type=='transformer': + if backbone_type == 'nontransformer': + model = resnet50_unet(n_classes, + input_height, + input_width, + task, + weight_decay, + pretraining) + elif backbone_type == 'transformer': num_patches_x = transformer_num_patches_xy[0] num_patches_y = transformer_num_patches_xy[1] num_patches = num_patches_x * num_patches_y if transformer_cnn_first: - if input_height != (num_patches_y * transformer_patchsize_y * 32): - print("Error: transformer_patchsize_y or transformer_num_patches_xy height value error . input_height should be equal to ( transformer_num_patches_xy height value * transformer_patchsize_y * 32)") - sys.exit(1) - if input_width != (num_patches_x * transformer_patchsize_x * 32): - print("Error: transformer_patchsize_x or transformer_num_patches_xy width value error . input_width should be equal to ( transformer_num_patches_xy width value * transformer_patchsize_x * 32)") - sys.exit(1) - if (transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x)) != 0: - print("Error: transformer_projection_dim error. 
The remainder when parameter transformer_projection_dim is divided by (transformer_patchsize_y*transformer_patchsize_x) should be zero") - sys.exit(1) - - - model = vit_resnet50_unet(n_classes, transformer_patchsize_x, transformer_patchsize_y, num_patches, transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_projection_dim, input_height, input_width, task, weight_decay, pretraining) + model_builder = vit_resnet50_unet + multiple_of_32 = True else: - if input_height != (num_patches_y * transformer_patchsize_y): - print("Error: transformer_patchsize_y or transformer_num_patches_xy height value error . input_height should be equal to ( transformer_num_patches_xy height value * transformer_patchsize_y)") - sys.exit(1) - if input_width != (num_patches_x * transformer_patchsize_x): - print("Error: transformer_patchsize_x or transformer_num_patches_xy width value error . input_width should be equal to ( transformer_num_patches_xy width value * transformer_patchsize_x)") - sys.exit(1) - if (transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x)) != 0: - print("Error: transformer_projection_dim error. 
The remainder when parameter transformer_projection_dim is divided by (transformer_patchsize_y*transformer_patchsize_x) should be zero") - sys.exit(1) - model = vit_resnet50_unet_transformer_before_cnn(n_classes, transformer_patchsize_x, transformer_patchsize_y, num_patches, transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_projection_dim, input_height, input_width, task, weight_decay, pretraining) + model_builder = vit_resnet50_unet_transformer_before_cnn + multiple_of_32 = False + + assert input_height == num_patches_y * transformer_patchsize_y * (32 if multiple_of_32 else 1), \ + "transformer_patchsize_y or transformer_num_patches_xy height value error: " \ + "input_height should be equal to " \ + "(transformer_num_patches_xy height value * transformer_patchsize_y%s)" % \ + (" * 32" if multiple_of_32 else "") + assert input_width == num_patches_x * transformer_patchsize_x * (32 if multiple_of_32 else 1), \ + "transformer_patchsize_x or transformer_num_patches_xy width value error: " \ + "input_width should be equal to " \ + "(transformer_num_patches_xy width value * transformer_patchsize_x%s)" % \ + (" * 32" if multiple_of_32 else "") + assert 0 == transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x), \ + "transformer_projection_dim error: " \ + "The remainder when parameter transformer_projection_dim is divided by " \ + "(transformer_patchsize_y*transformer_patchsize_x) should be zero" + + model = model_builder( + n_classes, + transformer_patchsize_x, + transformer_patchsize_y, + num_patches, + transformer_mlp_head_units, + transformer_layers, + transformer_num_heads, + transformer_projection_dim, + input_height, + input_width, + task, + weight_decay, + pretraining) #if you want to see the model structure just uncomment model summary.
model.summary() - - if task == "segmentation" or task == "binarization": - if not is_loss_soft_dice and not weighted_loss: - model.compile(loss='categorical_crossentropy', - optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) + if task in ["segmentation", "binarization"]: if is_loss_soft_dice: - model.compile(loss=soft_dice_loss, - optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) - if weighted_loss: - model.compile(loss=weighted_categorical_crossentropy(weights), - optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) - elif task == "enhancement": - model.compile(loss='mean_squared_error', - optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy']) - + loss = soft_dice_loss + elif weighted_loss: + loss = weighted_categorical_crossentropy(weights) + else: + loss = 'categorical_crossentropy' + else: # task == "enhancement" + loss = 'mean_squared_error' + model.compile(loss=loss, + optimizer=Adam(learning_rate=learning_rate), + metrics=['accuracy']) # generating train and evaluation data - train_gen = data_gen(dir_flow_train_imgs, dir_flow_train_labels, batch_size=n_batch, - input_height=input_height, input_width=input_width, n_classes=n_classes, task=task) - val_gen = data_gen(dir_flow_eval_imgs, dir_flow_eval_labels, batch_size=n_batch, - input_height=input_height, input_width=input_width, n_classes=n_classes, task=task) - + gen_kwargs = dict(batch_size=n_batch, + input_height=input_height, + input_width=input_width, + n_classes=n_classes, + task=task) + train_gen = data_gen(dir_flow_train_imgs, dir_flow_train_labels, **gen_kwargs) + val_gen = data_gen(dir_flow_eval_imgs, dir_flow_eval_labels, **gen_kwargs) + ##img_validation_patches = os.listdir(dir_flow_eval_imgs) ##score_best=[] ##score_best.append(0) + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] if save_interval: - save_weights_callback = SaveWeightsAfterSteps(save_interval, dir_output, _config) - + 
callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) for i in tqdm(range(index_start, n_epochs + index_start)): - if save_interval: - model.fit( - train_gen, - steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, - validation_data=val_gen, - validation_steps=1, - epochs=1, callbacks=[save_weights_callback]) - else: - model.fit( - train_gen, - steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, - validation_data=val_gen, - validation_steps=1, - epochs=1) - - model.save(os.path.join(dir_output,'model_'+str(i))) - - with open(os.path.join(os.path.join(dir_output,'model_'+str(i)),"config.json"), "w") as fp: + model.fit( + train_gen, + steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, + validation_data=val_gen, + validation_steps=1, + epochs=1, + callbacks=callbacks) + + dir_model = os.path.join(dir_output, 'model_' + str(i)) + model.save(dir_model) + with open(os.path.join(dir_model, "config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON #os.system('rm -rf '+dir_train_flowing) #os.system('rm -rf '+dir_eval_flowing) #model.save(dir_output+'/'+'model'+'.h5') + elif task=='classification': configuration() - model = resnet50_classifier(n_classes, input_height, input_width, weight_decay, pretraining) + model = resnet50_classifier(n_classes, + input_height, + input_width, + weight_decay, + pretraining) - opt_adam = Adam(learning_rate=0.001) model.compile(loss='categorical_crossentropy', - optimizer = opt_adam,metrics=['accuracy']) - + optimizer=Adam(learning_rate=0.001), # rs: why not learning_rate? 
+ metrics=['accuracy']) list_classes = list(classification_classes_name.values()) - testX, testY = generate_data_from_folder_evaluation(dir_eval, input_height, input_width, n_classes, list_classes) - - y_tot=np.zeros((testX.shape[0],n_classes)) + trainXY = generate_data_from_folder_training( + dir_train, n_batch, input_height, input_width, n_classes, list_classes) + testX, testY = generate_data_from_folder_evaluation( + dir_eval, input_height, input_width, n_classes, list_classes) + y_tot = np.zeros((testX.shape[0], n_classes)) score_best= [0] - num_rows = return_number_of_total_training_data(dir_train) weights=[] + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] for i in range(n_epochs): - history = model.fit( generate_data_from_folder_training(dir_train, n_batch , input_height, input_width, n_classes, list_classes), steps_per_epoch=num_rows / n_batch, verbose=1)#,class_weight=weights) - + history = model.fit(trainXY, + steps_per_epoch=num_rows / n_batch, + #class_weight=weights) + verbose=1, + callbacks=callbacks) y_pr_class = [] for jj in range(testY.shape[0]): y_pr=model.predict(testX[jj,:,:,:].reshape(1,input_height,input_width,3), verbose=0) @@ -433,7 +485,8 @@ def run(_config, n_classes, n_epochs, input_height, elif task=='reading_order': configuration() - model = machine_based_reading_order_model(n_classes,input_height,input_width,weight_decay,pretraining) + model = machine_based_reading_order_model( + n_classes, input_height, input_width, weight_decay, pretraining) dir_flow_train_imgs = os.path.join(dir_train, 'images') dir_flow_train_labels = os.path.join(dir_train, 'labels') @@ -447,20 +500,26 @@ def run(_config, n_classes, n_epochs, input_height, #f1score_tot = [0] indexer_start = 0 - # opt = SGD(learning_rate=0.01, momentum=0.9) - opt_adam = tf.keras.optimizers.Adam(learning_rate=0.0001) model.compile(loss="binary_crossentropy", - optimizer = opt_adam,metrics=['accuracy']) + #optimizer=SGD(learning_rate=0.01, 
momentum=0.9), + optimizer=Adam(learning_rate=0.0001), # rs: why not learning_rate? + metrics=['accuracy']) + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] if save_interval: - save_weights_callback = SaveWeightsAfterSteps(save_interval, dir_output, _config) - + callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) + + trainXY = generate_arrays_from_folder_reading_order( + dir_flow_train_labels, dir_flow_train_imgs, + n_batch, input_height, input_width, n_classes, + thetha, augmentation) + for i in range(n_epochs): - if save_interval: - history = model.fit(generate_arrays_from_folder_reading_order(dir_flow_train_labels, dir_flow_train_imgs, n_batch, input_height, input_width, n_classes, thetha, augmentation), steps_per_epoch=num_rows / n_batch, verbose=1, callbacks=[save_weights_callback]) - else: - history = model.fit(generate_arrays_from_folder_reading_order(dir_flow_train_labels, dir_flow_train_imgs, n_batch, input_height, input_width, n_classes, thetha, augmentation), steps_per_epoch=num_rows / n_batch, verbose=1) - model.save( os.path.join(dir_output,'model_'+str(i+indexer_start) )) + history = model.fit(trainXY, + steps_per_epoch=num_rows / n_batch, + verbose=1, + callbacks=callbacks) + model.save(os.path.join(dir_output, 'model_'+str(i+indexer_start) )) with open(os.path.join(os.path.join(dir_output,'model_'+str(i)),"config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 7d01e74..c8caca9 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -12,7 +12,6 @@ from shapely import set_precision from shapely.ops import unary_union, nearest_points from .rotate import rotate_image, rotation_image_new -from . 
import ensure_array def contours_in_same_horizon(cy_main_hor): """ @@ -249,12 +248,14 @@ def return_contours_of_image(image): return contours, hierarchy def dilate_textline_contours(all_found_textline_polygons): + from . import ensure_array return [ensure_array( [polygon2contour(contour2polygon(contour, dilate=6)) for contour in region]) for region in all_found_textline_polygons] def dilate_textregion_contours(all_found_textregion_polygons): + from . import ensure_array return ensure_array( [polygon2contour(contour2polygon(contour, dilate=6)) for contour in all_found_textregion_polygons]) From 87d7ffbdd84283f0e2e6dca23d4d05431cf8bb3f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Jan 2026 11:25:00 +0100 Subject: [PATCH 050/118] training: use proper Keras callbacks and top-level loop --- ..._model_load_pretrained_weights_and_save.py | 10 -- src/eynollah/training/gt_gen_utils.py | 1 + src/eynollah/training/models.py | 3 + src/eynollah/training/train.py | 168 ++++++++---------- train/requirements.txt | 2 +- 5 files changed, 84 insertions(+), 100 deletions(-) diff --git a/src/eynollah/training/build_model_load_pretrained_weights_and_save.py b/src/eynollah/training/build_model_load_pretrained_weights_and_save.py index 9fba66b..15eaf64 100644 --- a/src/eynollah/training/build_model_load_pretrained_weights_and_save.py +++ b/src/eynollah/training/build_model_load_pretrained_weights_and_save.py @@ -1,17 +1,9 @@ import sys import click -import tensorflow as tf from .models import resnet50_unet -def configuration(): - try: - for device in tf.config.list_physical_devices('GPU'): - tf.config.experimental.set_memory_growth(device, True) - except: - print("no GPU device available", file=sys.stderr) - @click.command() def build_model_load_pretrained_weights_and_save(): n_classes = 2 @@ -21,8 +13,6 @@ def build_model_load_pretrained_weights_and_save(): pretraining = False dir_of_weights = 'model_bin_sbb_ens.h5' - # configuration() - model = resnet50_unet(n_classes, 
input_height, input_width, weight_decay, pretraining) model.load_weights(dir_of_weights) model.save('./name_in_another_python_version.h5') diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py index 2e3428b..b7c35ee 100644 --- a/src/eynollah/training/gt_gen_utils.py +++ b/src/eynollah/training/gt_gen_utils.py @@ -653,6 +653,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ num_col = int(text_comments.split('num_col')[1]) comment_is_sub_element = True if not comment_is_sub_element: + # FIXME: look in /Page/@custom as well num_col = None if num_col: diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index fdc5437..3b38fe8 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -1,3 +1,6 @@ +import os + +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow import keras from tensorflow.keras.models import * diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index da901b0..7ee63f9 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -32,7 +32,7 @@ os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras.optimizers import SGD, Adam from tensorflow.keras.models import load_model -from tensorflow.keras.callbacks import Callback, TensorBoard +from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard from sacred import Experiment from tqdm import tqdm from sklearn.metrics import f1_score @@ -40,26 +40,28 @@ from sklearn.metrics import f1_score import numpy as np import cv2 -class SaveWeightsAfterSteps(Callback): - def __init__(self, save_interval, save_path, _config): - super(SaveWeightsAfterSteps, self).__init__() - self.save_interval = save_interval - self.save_path = save_path - self.step_count = 0 +class SaveWeightsAfterSteps(ModelCheckpoint): + def 
__init__(self, save_interval, save_path, _config, **kwargs): + if save_interval: + # batches + super().__init__( + os.path.join(save_path, "model_step_{batch:04d}"), + save_freq=save_interval, + verbose=1, + **kwargs) + else: + super().__init__( + os.path.join(save_path, "model_{epoch:02d}"), + save_freq="epoch", + verbose=1, + **kwargs) self._config = _config - def on_train_batch_end(self, batch, logs=None): - self.step_count += 1 - - if self.step_count % self.save_interval ==0: - save_file = f"{self.save_path}/model_step_{self.step_count}" - #os.system('mkdir '+save_file) - - self.model.save(save_file) - - with open(os.path.join(os.path.join(self.save_path, f"model_step_{self.step_count}"),"config.json"), "w") as fp: - json.dump(self._config, fp) # encode dict into JSON - print(f"saved model as steps {self.step_count} to {save_file}") + # overwrite tf-keras (Keras 2) implementation to get our _config JSON in + def _save_handler(self, filepath): + super()._save_handler(filepath) + with open(os.path.join(filepath, "config.json"), "w") as fp: + json.dump(self._config, fp) # encode dict into JSON def configuration(): @@ -396,23 +398,19 @@ def run(_config, n_classes, n_epochs, input_height, ##score_best=[] ##score_best.append(0) - callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), + SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) - for i in tqdm(range(index_start, n_epochs + index_start)): - model.fit( - train_gen, - steps_per_epoch=int(len(os.listdir(dir_flow_train_imgs)) / n_batch) - 1, - validation_data=val_gen, - validation_steps=1, - epochs=1, - callbacks=callbacks) - - dir_model = os.path.join(dir_output, 'model_' + str(i)) - model.save(dir_model) - with open(os.path.join(dir_model, "config.json"), "w") as fp: - json.dump(_config, fp) # encode dict into JSON + 
model.fit( + train_gen, + steps_per_epoch=len(os.listdir(dir_flow_train_imgs)) // n_batch - 1, + validation_data=val_gen, + #validation_steps=1, # rs: only one batch?? + validation_steps=len(os.listdir(dir_flow_eval_imgs)) // n_batch - 1, + epochs=n_epochs, + callbacks=callbacks) #os.system('rm -rf '+dir_train_flowing) #os.system('rm -rf '+dir_eval_flowing) @@ -434,54 +432,49 @@ def run(_config, n_classes, n_epochs, input_height, list_classes = list(classification_classes_name.values()) trainXY = generate_data_from_folder_training( dir_train, n_batch, input_height, input_width, n_classes, list_classes) - testX, testY = generate_data_from_folder_evaluation( + testXY = generate_data_from_folder_evaluation( dir_eval, input_height, input_width, n_classes, list_classes) y_tot = np.zeros((testX.shape[0], n_classes)) - score_best= [0] num_rows = return_number_of_total_training_data(dir_train) - weights=[] - callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), + SaveWeightsAfterSteps(0, dir_output, _config, + monitor='val_f1', + save_best_only=True, mode='max')] - for i in range(n_epochs): - history = model.fit(trainXY, - steps_per_epoch=num_rows / n_batch, - #class_weight=weights) - verbose=1, - callbacks=callbacks) - y_pr_class = [] - for jj in range(testY.shape[0]): - y_pr=model.predict(testX[jj,:,:,:].reshape(1,input_height,input_width,3), verbose=0) - y_pr_ind= np.argmax(y_pr,axis=1) - y_pr_class.append(y_pr_ind) - - y_pr_class = np.array(y_pr_class) - f1score=f1_score(np.argmax(testY,axis=1), y_pr_class, average='macro') - print(i,f1score) - - if f1score>score_best[0]: - score_best[0]=f1score - model.save(os.path.join(dir_output,'model_best')) - - if f1score > f1_threshold_classification: - weights.append(model.get_weights() ) - + history = model.fit(trainXY, + steps_per_epoch=num_rows / n_batch, + #class_weight=weights) + validation_data=testXY, + verbose=1, + 
epochs=n_epochs, + metrics=[F1Score(average='macro', name='f1')], + callbacks=callbacks) - if len(weights) >= 1: - new_weights=list() - for weights_list_tuple in zip(*weights): - new_weights.append( [np.array(weights_).mean(axis=0) for weights_ in zip(*weights_list_tuple)] ) + usable_checkpoints = np.flatnonzero(np.array(history.history['val_f1']) > f1_threshold_classification) + if len(usable_checkpoints) >= 1: + print("averaging over usable checkpoints", usable_checkpoints) + all_weights = [] + for epoch in usable_checkpoints: + cp_path = os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch)) + assert os.path.isdir(cp_path) + model = load_model(cp_path, compile=False) + all_weights.append(model.get_weights()) + + new_weights = [] + for layer_weights in zip(*all_weights): + layer_weights = np.array([np.array(weights).mean(axis=0) + for weights in zip(*layer_weights)]) + new_weights.append(layer_weights) - new_weights = [np.array(x) for x in new_weights] - model_weight_averaged=tf.keras.models.clone_model(model) - model_weight_averaged.set_weights(new_weights) - - model_weight_averaged.save(os.path.join(dir_output,'model_ens_avg')) - with open(os.path.join( os.path.join(dir_output,'model_ens_avg'), "config.json"), "w") as fp: + #model = tf.keras.models.clone_model(model) + model.set_weights(new_weights) + + cp_path = os.path.join(dir_output, 'model_ens_avg') + model.save(cp_path) + with open(os.path.join(cp_path, "config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON - - with open(os.path.join( os.path.join(dir_output,'model_best'), "config.json"), "w") as fp: - json.dump(_config, fp) # encode dict into JSON + print("ensemble model saved under", cp_path) elif task=='reading_order': configuration() @@ -505,7 +498,8 @@ def run(_config, n_classes, n_epochs, input_height, optimizer=Adam(learning_rate=0.0001), # rs: why not learning_rate?
metrics=['accuracy']) - callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False)] + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), + SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) @@ -514,20 +508,16 @@ def run(_config, n_classes, n_epochs, input_height, n_batch, input_height, input_width, n_classes, thetha, augmentation) - for i in range(n_epochs): - history = model.fit(trainXY, - steps_per_epoch=num_rows / n_batch, - verbose=1, - callbacks=callbacks) - model.save(os.path.join(dir_output, 'model_'+str(i+indexer_start) )) - - with open(os.path.join(os.path.join(dir_output,'model_'+str(i)),"config.json"), "w") as fp: - json.dump(_config, fp) # encode dict into JSON - ''' - if f1score>f1score_tot[0]: - f1score_tot[0] = f1score - model_dir = os.path.join(dir_out,'model_best') - model.save(model_dir) - ''' + history = model.fit(trainXY, + steps_per_epoch=num_rows / n_batch, + verbose=1, + epochs=n_epochs, + callbacks=callbacks) + ''' + if f1score>f1score_tot[0]: + f1score_tot[0] = f1score + model_dir = os.path.join(dir_out,'model_best') + model.save(model_dir) + ''' diff --git a/train/requirements.txt b/train/requirements.txt index 63f3813..8ad884d 100644 --- a/train/requirements.txt +++ b/train/requirements.txt @@ -1,6 +1,6 @@ sacred seaborn -numpy <1.24.0 +numpy tqdm imutils scipy From 6a81db934e16971bc7edcf4b0b41a918dc444d5c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Jan 2026 11:25:50 +0100 Subject: [PATCH 051/118] improve docs/train.md --- docs/train.md | 168 ++++++++++++++++++++++++++++---------------------- 1 file changed, 96 insertions(+), 72 deletions(-) diff --git a/docs/train.md b/docs/train.md index 252bead..4e76740 100644 --- a/docs/train.md +++ b/docs/train.md @@ -9,9 +9,9 @@ on how to generate the corresponding training dataset. 
The following three tasks can all be accomplished using the code in the [`train`](https://github.com/qurator-spk/eynollah/tree/main/train) directory: -* generate training dataset -* train a model -* inference with the trained model +* [Generate training dataset](#generate-training-dataset) +* [Train a model](#train-a-model) +* [Inference with the trained model](#inference-with-the-trained-model) ## Training, evaluation and output @@ -63,7 +63,7 @@ serve as labels. The enhancement model can be trained with this generated datase For machine-based reading order, we aim to determine the reading priority between two sets of text regions. The model's input is a three-channel image: the first and last channels contain information about each of the two text regions, while the middle channel encodes prominent layout elements necessary for reading order, such as separators and headers. -To generate the training dataset, our script requires a page XML file that specifies the image layout with the correct +To generate the training dataset, our script requires a PAGE XML file that specifies the image layout with the correct reading order. For output images, it is necessary to specify the width and height. Additionally, a minimum text region size can be set @@ -82,8 +82,14 @@ eynollah-training generate-gt machine-based-reading-order \ ### pagexml2label -pagexml2label is designed to generate labels from GT page XML files for various pixel-wise segmentation use cases, -including 'layout,' 'textline,' 'printspace,' 'glyph,' and 'word' segmentation. +`pagexml2label` is designed to generate labels from PAGE XML GT files for various pixel-wise segmentation use cases, +including: +- `printspace` (i.e. page frame), +- `layout` (i.e. regions), +- `textline`, +- `word`, and +- `glyph`. + To train a pixel-wise segmentation model, we require images along with their corresponding labels. 
Our training script expects a PNG image where each pixel corresponds to a label, represented by an integer. The background is always labeled as zero, while other elements are assigned different integers. For instance, if we have ground truth data with four @@ -93,7 +99,7 @@ In binary segmentation scenarios such as textline or page extraction, the backgr element is automatically encoded as 1 in the PNG label. To specify the desired use case and the elements to be extracted in the PNG labels, a custom JSON file can be passed. -For example, in the case of 'textline' detection, the JSON file would resemble this: +For example, in the case of textline detection, the JSON contents could be this: ```yaml { @@ -101,61 +107,77 @@ For example, in the case of 'textline' detection, the JSON file would resemble t } ``` -In the case of layout segmentation a custom config json file can look like this: +In the case of layout segmentation, the config JSON file might look like this: ```yaml { "use_case": "layout", -"textregions":{"rest_as_paragraph":1 , "drop-capital": 1, "header":2, "heading":2, "marginalia":3}, -"imageregion":4, -"separatorregion":5, -"graphicregions" :{"rest_as_decoration":6 ,"stamp":7} +"textregions": {"rest_as_paragraph": 1, "drop-capital": 1, "header": 2, "heading": 2, "marginalia": 3}, +"imageregion": 4, +"separatorregion": 5, +"graphicregions": {"rest_as_decoration": 6, "stamp": 7} } ``` -A possible custom config json file for layout segmentation where the "printspace" is a class: +The same example if `PrintSpace` (or `Border`) should be represented as a unique class: ```yaml { "use_case": "layout", -"textregions":{"rest_as_paragraph":1 , "drop-capital": 1, "header":2, "heading":2, "marginalia":3}, -"imageregion":4, -"separatorregion":5, -"graphicregions" :{"rest_as_decoration":6 ,"stamp":7} -"printspace_as_class_in_layout" : 8 +"textregions": {"rest_as_paragraph": 1, "drop-capital": 1, "header": 2, "heading": 2, "marginalia": 3}, +"imageregion": 4, 
+"separatorregion": 5, +"graphicregions": {"rest_as_decoration": 6, "stamp": 7} +"printspace_as_class_in_layout": 8 } ``` -For the layout use case, it is beneficial to first understand the structure of the page XML file and its elements. -In a given image, the annotations of elements are recorded in a page XML file, including their contours and classes. -For an image document, the known regions are 'textregion', 'separatorregion', 'imageregion', 'graphicregion', -'noiseregion', and 'tableregion'. +In the `layout` use-case, it is beneficial to first understand the structure of the PAGE XML file and its elements. +For a given page image, the visible segments are annotated in XML with their polygon coordinates and types. +On the region level, available segment types include `TextRegion`, `SeparatorRegion`, `ImageRegion`, `GraphicRegion`, +`NoiseRegion` and `TableRegion`. -Text regions and graphic regions also have their own specific types. The known types for text regions are 'paragraph', -'header', 'heading', 'marginalia', 'drop-capital', 'footnote', 'footnote-continued', 'signature-mark', 'page-number', -and 'catch-word'. The known types for graphic regions are 'handwritten-annotation', 'decoration', 'stamp', and -'signature'. -Since we don't know all types of text and graphic regions, unknown cases can arise. To handle these, we have defined -two additional types, "rest_as_paragraph" and "rest_as_decoration", to ensure that no unknown types are missed. -This way, users can extract all known types from the labels and be confident that no unknown types are overlooked. +Moreover, text regions and graphic regions in particular are subdivided via `@type`: +- The allowed subtypes for text regions are `paragraph`, `heading`, `marginalia`, `drop-capital`, `header`, `footnote`, +`footnote-continued`, `signature-mark`, `page-number` and `catch-word`. +- The known subtypes for graphic regions are `handwritten-annotation`, `decoration`, `stamp` and `signature`. 
-In the custom JSON file shown above, "header" and "heading" are extracted as the same class, while "marginalia" is shown -as a different class. All other text region types, including "drop-capital," are grouped into the same class. For the -graphic region, "stamp" has its own class, while all other types are classified together. "Image region" and "separator -region" are also present in the label. However, other regions like "noise region" and "table region" will not be -included in the label PNG file, even if they have information in the page XML files, as we chose not to include them. +These types and subtypes must be mapped to classes for the segmentation model. However, sometimes these fine-grained +distinctions are not useful or the existing annotations are not very usable (too scarce or too unreliable). +In that case, instead of these subtypes with a specific mapping, they can be pooled together by using the two special +types: +- `rest_as_paragraph` (mapping missing TextRegion subtypes and `paragraph`) +- `rest_as_decoration` (mapping missing GraphicRegion subtypes and `decoration`) + +(That way, users can extract all known types from the labels and be confident that no subtypes are overlooked.) + +In the custom JSON example shown above, `header` and `heading` are extracted as the same class, +while `marginalia` is modelled as a different class. All other text region types, including `drop-capital`, +are grouped into the same class. For graphic regions, `stamp` has its own class, while all other types +are classified together. `ImageRegion` and `SeparatorRegion` will also be represented with a class label in the +training data. However, other regions like `NoiseRegion` or `TableRegion` will not be included in the PNG files, +even if they were present in the PAGE XML.
+ +The tool expects various command-line options: ```sh eynollah-training generate-gt pagexml2label \ - -dx "dir of GT xml files" \ - -do "dir where output label png files will be written" \ - -cfg "custom config json file" \ - -to "output type which has 2d and 3d. 2d is used for training and 3d is just to visualise the labels" + -dx "dir of input PAGE XML files" \ + -do "dir of output label PNG files" \ + -cfg "custom config JSON file" \ + -to "output type (2d or 3d)" ``` -We have also defined an artificial class that can be added to the boundary of text region types or text lines. This key -is called "artificial_class_on_boundary." If users want to apply this to certain text regions in the layout use case, -the example JSON config file should look like this: +As output type, use +- `2d` for training, +- `3d` to just visualise the labels. + +We have also defined an artificial class that can be added to (rendered around) the boundary +of text region types or text lines in order to make separation of neighbouring segments more +reliable. The key is called `artificial_class_on_boundary`, and it takes a list of text region +types to be applied to. + +Our example JSON config file could then look like this: ```yaml { @@ -177,14 +199,15 @@ the example JSON config file should look like this: } ``` -This implies that the artificial class label, denoted by 7, will be present on PNG files and will only be added to the -elements labeled as "paragraph," "header," "heading," and "marginalia." +This implies that the artificial class label (denoted by 7) will be present in the generated PNG files +and will only be added around segments labeled `paragraph`, `header`, `heading` or `marginalia`. (This +class will be handled specially during decoding at inference, and not show up in final results.) -For "textline", "word", and "glyph", the artificial class on the boundaries will be activated only if the -"artificial_class_label" key is specified in the config file. 
Its value should be set as 2 since these elements -represent binary cases. For example, if the background and textline are denoted as 0 and 1 respectively, then the -artificial class should be assigned the value 2. The example JSON config file should look like this for "textline" use -case: +For `printspace`, `textline`, `word`, and `glyph` segmentation use-cases, there is no `artificial_class_on_boundary` key, +but `artificial_class_label` is available. If specified in the config file, then its value should be set at 2, because +these elements represent binary classification problems (with background represented as 0, and segments as 1, respectively). + +For example, the JSON config for textline detection could look as follows: ```yaml { @@ -193,33 +216,33 @@ case: } ``` -If the coordinates of "PrintSpace" or "Border" are present in the page XML ground truth files, and the user wishes to -crop only the print space area, this can be achieved by activating the "-ps" argument. However, it should be noted that -in this scenario, since cropping will be applied to the label files, the directory of the original images must be -provided to ensure that they are cropped in sync with the labels. This ensures that the correct images and labels -required for training are obtained. The command should resemble the following: +If the coordinates of `PrintSpace` (or `Border`) are present in the PAGE XML ground truth files, +and one wishes to crop images to only cover the print space bounding box, this can be achieved +by passing the `-ps` option. Note that in this scenario, the directory of the original images +must also be provided, to ensure that the images are cropped in sync with the labels. The command +line would then resemble this: ```sh eynollah-training generate-gt pagexml2label \ - -dx "dir of GT xml files" \ - -do "dir where output label png files will be written" \ - -cfg "custom config json file" \ - -to "output type which has 2d and 3d. 
2d is used for training and 3d is just to visualise the labels" \ + -dx "dir of input PAGE XML files" \ + -do "dir of output label PNG files" \ + -cfg "custom config JSON file" \ + -to "output type (2d or 3d)" \ -ps \ - -di "dir where the org images are located" \ - -doi "dir where the cropped output images will be written" + -di "dir of input original images" \ + -doi "dir of output cropped images" ``` ## Train a model ### classification -For the classification use case, we haven't provided a ground truth generator, as it's unnecessary. For classification, -all we require is a training directory with subdirectories, each containing images of its respective classes. We need +For the image classification use-case, we have not provided a ground truth generator, as it is unnecessary. +All we require is a training directory with subdirectories, each containing images of its respective classes. We need separate directories for training and evaluation, and the class names (subdirectories) must be consistent across both directories. Additionally, the class names should be specified in the config JSON file, as shown in the following example. If, for instance, we aim to classify "apple" and "orange," with a total of 2 classes, the -"classification_classes_name" key in the config file should appear as follows: +`classification_classes_name` key in the config file should appear as follows: ```yaml { @@ -241,7 +264,7 @@ example. If, for instance, we aim to classify "apple" and "orange," with a total } ``` -The "dir_train" should be like this: +Then `dir_train` should be like this: ``` . @@ -250,7 +273,7 @@ The "dir_train" should be like this: └── orange # directory of images for orange class ``` -And the "dir_eval" the same structure as train directory: +And `dir_eval` analogously: ``` . 
@@ -310,7 +333,7 @@ And the "dir_eval" the same structure as train directory: └── labels # directory of labels ``` -The classification model can be trained like the classification case command line. +The reading-order model can be trained like the classification case command line. ### Segmentation (Textline, Binarization, Page extraction and layout) and enhancement @@ -374,9 +397,9 @@ classification and machine-based reading order, as you can see in their example * `transformer_num_heads`: Transformer number of heads. Default value is 4. * `transformer_cnn_first`: We have two types of vision transformers. In one type, a CNN is applied first, followed by a transformer. In the other type, this order is reversed. If transformer_cnn_first is true, it means the CNN will be applied before the transformer. Default value is true. -In the case of segmentation and enhancement the train and evaluation directory should be as following. +In case of segmentation and enhancement the train and evaluation data should be organised as follows. -The "dir_train" should be like this: +The "dir_train" directory should be like this: ``` . @@ -394,11 +417,12 @@ And the "dir_eval" the same structure as train directory: └── labels # directory of labels ``` -After configuring the JSON file for segmentation or enhancement, training can be initiated by running the following -command, similar to the process for classification and reading order: +After configuring the JSON file for segmentation or enhancement, +training can be initiated by running the following command line, +similar to classification and reading-order model training: -``` -eynollah-training train with config_classification.json` +```sh +eynollah-training train with config_classification.json ``` #### Binarization @@ -690,7 +714,7 @@ This will straightforwardly return the class of the image. 
### machine based reading order -To infer the reading order using a reading order model, we need a page XML file containing layout information but +To infer the reading order using a reading order model, we need a PAGE XML file containing layout information but without the reading order. We simply need to provide the model directory, the XML file, and the output directory. The new XML file with the added reading order will be written to the output directory with the same name. We need to run: From eb92760f73f9d8eefa9028ea697c4152d07e39ec Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Jan 2026 19:49:39 +0100 Subject: [PATCH 052/118] training: download pretrained RESNET weights if missing --- src/eynollah/training/models.py | 17 ++++++++++------- src/eynollah/training/train.py | 15 ++++++++++++++- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 3b38fe8..011c614 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -12,7 +12,10 @@ from tensorflow.keras.regularizers import l2 ###projection_dim = 64 ##transformer_layers = 2#8 ##num_heads = 1#4 -resnet50_Weights_path = './pretrained_model/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5' +RESNET50_WEIGHTS_PATH = './pretrained_model/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5' +RESNET50_WEIGHTS_URL = ('https://github.com/fchollet/deep-learning-models/releases/download/v0.2/' + 'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5') + IMAGE_ORDERING = 'channels_last' MERGE_AXIS = -1 @@ -242,7 +245,7 @@ def resnet50_unet_light(n_classes, input_height=224, input_width=224, taks="segm f5 = x if pretraining: - model = Model(img_input, x).load_weights(resnet50_Weights_path) + model = Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) v512_2048 = Conv2D(512, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f5) v512_2048 = 
(BatchNormalization(axis=bn_axis))(v512_2048) @@ -343,7 +346,7 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati f5 = x if pretraining: - Model(img_input, x).load_weights(resnet50_Weights_path) + Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) v1024_2048 = Conv2D(1024, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))( f5) @@ -442,7 +445,7 @@ def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_he f5 = x if pretraining: - model = Model(inputs, x).load_weights(resnet50_Weights_path) + model = Model(inputs, x).load_weights(RESNET50_WEIGHTS_PATH) #num_patches = x.shape[1]*x.shape[2] @@ -590,7 +593,7 @@ def vit_resnet50_unet_transformer_before_cnn(n_classes, patch_size_x, patch_size f5 = x if pretraining: - model = Model(encoded_patches, x).load_weights(resnet50_Weights_path) + model = Model(encoded_patches, x).load_weights(RESNET50_WEIGHTS_PATH) v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(x) v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) @@ -690,7 +693,7 @@ def resnet50_classifier(n_classes,input_height=224,input_width=224,weight_decay= f5 = x if pretraining: - Model(img_input, x).load_weights(resnet50_Weights_path) + Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) x = AveragePooling2D((7, 7), name='avg_pool')(x) x = Flatten()(x) @@ -746,7 +749,7 @@ def machine_based_reading_order_model(n_classes,input_height=224,input_width=224 x1 = identity_block(x1, 3, [512, 512, 2048], stage=5, block='c') if pretraining: - Model(img_input , x1).load_weights(resnet50_Weights_path) + Model(img_input , x1).load_weights(RESNET50_WEIGHTS_PATH) x1 = AveragePooling2D((7, 7), name='avg_pool1')(x1) flattened = Flatten()(x1) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 7ee63f9..6353474 100644 --- a/src/eynollah/training/train.py +++ 
b/src/eynollah/training/train.py @@ -2,6 +2,7 @@ import os import sys import json +import requests import click from eynollah.training.metrics import ( @@ -15,7 +16,9 @@ from eynollah.training.models import ( resnet50_classifier, resnet50_unet, vit_resnet50_unet, - vit_resnet50_unet_transformer_before_cnn + vit_resnet50_unet_transformer_before_cnn, + RESNET50_WEIGHTS_PATH, + RESNET50_WEIGHTS_URL ) from eynollah.training.utils import ( data_gen, @@ -80,6 +83,12 @@ def get_dirs_or_files(input_data): assert os.path.isdir(labels_input), "{} is not a directory".format(labels_input) return image_input, labels_input +def download_file(url, path): + with open(path, 'wb') as f: + with requests.get(url, stream=True) as r: + r.raise_for_status() + for data in r.iter_content(chunk_size=4096): + f.write(data) ex = Experiment(save_git_info=False) @@ -163,6 +172,10 @@ def run(_config, n_classes, n_epochs, input_height, transformer_patchsize_x, transformer_patchsize_y, transformer_num_patches_xy, backbone_type, save_interval, flip_index, dir_eval, dir_output, pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name, dir_img_bin, number_of_backgrounds_per_image,dir_rgb_backgrounds, dir_rgb_foregrounds): + + if pretraining and not os.path.isfile(RESNET50_WEIGHTS_PATH): + print("downloading RESNET50 pretrained weights to", RESNET50_WEIGHTS_PATH) + download_file(RESNET50_WEIGHTS_URL, RESNET50_WEIGHTS_PATH) if dir_rgb_backgrounds: list_all_possible_background_images = os.listdir(dir_rgb_backgrounds) From acda9c84eecca75e5260b2172923f59e86838a73 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Jan 2026 13:28:03 +0100 Subject: [PATCH 053/118] =?UTF-8?q?training.gt=5Fgen=5Futils:=20improve=20?= =?UTF-8?q?XML=E2=86=92img=20path=20mapping=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit when matching files in `dir_images` by XML path name stem, * use `dict` instead of `list` to assign reliably 
* filter out `.xml` files (so input directories can be mixed) * show informative warnings for files which cannot be matched --- src/eynollah/training/gt_gen_utils.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py index b7c35ee..f4defdd 100644 --- a/src/eynollah/training/gt_gen_utils.py +++ b/src/eynollah/training/gt_gen_utils.py @@ -627,7 +627,10 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if dir_images: ls_org_imgs = os.listdir(dir_images) - ls_org_imgs_stem = [os.path.splitext(item)[0] for item in ls_org_imgs] + ls_org_imgs = {os.path.splitext(item)[0]: item + for item in ls_org_imgs + if not item.endswith('.xml')} + for index in tqdm(range(len(gt_list))): #try: print(gt_list[index]) @@ -802,7 +805,13 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ cv2.imwrite(os.path.join(output_dir, xml_file_stem + '.png'), img_poly) if dir_images: - org_image_name = ls_org_imgs[ls_org_imgs_stem.index(xml_file_stem)] + org_image_name = ls_org_imgs[xml_file_stem] + if not org_image_name: + print("image file for XML stem", xml_file_stem, "is missing") + continue + if not os.path.isfile(os.path.join(dir_images, org_image_name)): + print("image file for XML stem", xml_file_stem, "is not readable") + continue img_org = cv2.imread(os.path.join(dir_images, org_image_name)) if printspace and config_params['use_case']!='printspace': @@ -1266,7 +1275,13 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if dir_images: - org_image_name = ls_org_imgs[ls_org_imgs_stem.index(xml_file_stem)] + org_image_name = ls_org_imgs[xml_file_stem] + if not org_image_name: + print("image file for XML stem", xml_file_stem, "is missing") + continue + if not os.path.isfile(os.path.join(dir_images, org_image_name)): + print("image file for XML stem", xml_file_stem, "is not readable") 
+ continue img_org = cv2.imread(os.path.join(dir_images, org_image_name)) if printspace: From 0372fd7a1ec2e4d654c0f24171c9b30c77a3e09b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Jan 2026 13:42:59 +0100 Subject: [PATCH 054/118] =?UTF-8?q?training.gt=5Fgen=5Futils:=20fix+simpli?= =?UTF-8?q?fy=20cropping=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit when parsing `PrintSpace` or `Border` from PAGE-XML, - use `lxml` XPath instead of nested loops - convert points to polygons directly (instead of painting on canvas and retrieving contours) - pass result bbox in slice notation (instead of xywh) --- src/eynollah/training/gt_gen_utils.py | 151 ++++++++------------------ src/eynollah/training/inference.py | 18 ++- 2 files changed, 51 insertions(+), 118 deletions(-) diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py index f4defdd..f068afd 100644 --- a/src/eynollah/training/gt_gen_utils.py +++ b/src/eynollah/training/gt_gen_utils.py @@ -1,15 +1,18 @@ import os import numpy as np import warnings -import xml.etree.ElementTree as ET +from lxml import etree as ET from tqdm import tqdm import cv2 from shapely import geometry from pathlib import Path from PIL import ImageFont +from ocrd_utils import bbox_from_points KERNEL = np.ones((5, 5), np.uint8) +NS = { 'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15' +} with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -664,52 +667,13 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ y_new = int ( x_new * (y_len / float(x_len)) ) if printspace or "printspace_as_class_in_layout" in list(config_params.keys()): - region_tags = np.unique([x for x in alltags if x.endswith('PrintSpace') or x.endswith('Border')]) - co_use_case = [] - - for tag in region_tags: - tag_endings = ['}PrintSpace','}Border'] - - if tag.endswith(tag_endings[0]) or 
tag.endswith(tag_endings[1]): - for nn in root1.iter(tag): - c_t_in = [] - sumi = 0 - for vv in nn.iter(): - # check the format of coords - if vv.tag == link + 'Coords': - coords = bool(vv.attrib) - if coords: - p_h = vv.attrib['points'].split(' ') - c_t_in.append( - np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) - break - else: - pass - - if vv.tag == link + 'Point': - c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))]) - sumi += 1 - elif vv.tag != link + 'Point' and sumi >= 1: - break - co_use_case.append(np.array(c_t_in)) - - img = np.zeros((y_len, x_len, 3)) - - img_poly = cv2.fillPoly(img, pts=co_use_case, color=(1, 1, 1)) - - img_poly = img_poly.astype(np.uint8) - - imgray = cv2.cvtColor(img_poly, cv2.COLOR_BGR2GRAY) - _, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) - - cnt = contours[np.argmax(cnt_size)] - - x, y, w, h = cv2.boundingRect(cnt) - bb_xywh = [x, y, w, h] + ps = (root1.xpath('/pc:PcGts/pc:Page/pc:Border', namespaces=NS) + + root1.xpath('/pc:PcGts/pc:Page/pc:PrintSpace', namespaces=NS)) + if len(ps): + points = ps[0].find('pc:Coords', NS).get('points') + ps_bbox = bbox_from_points(points) + else: + ps_bbox = [0, 0, None, None] if config_file and (config_params['use_case']=='textline' or config_params['use_case']=='word' or config_params['use_case']=='glyph' or config_params['use_case']=='printspace'): @@ -791,7 +755,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if printspace and config_params['use_case']!='printspace': - img_poly = img_poly[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + img_poly = img_poly[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] if 'columns_width' in list(config_params.keys()) and num_col and config_params['use_case']!='printspace': @@ -815,7 +780,8 
@@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_org = cv2.imread(os.path.join(dir_images, org_image_name)) if printspace and config_params['use_case']!='printspace': - img_org = img_org[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + img_org = img_org[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] if 'columns_width' in list(config_params.keys()) and num_col and config_params['use_case']!='printspace': img_org = resize_image(img_org, y_new, x_new) @@ -1194,7 +1160,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if "printspace_as_class_in_layout" in list(config_params.keys()): printspace_mask = np.zeros((img_poly.shape[0], img_poly.shape[1])) - printspace_mask[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2]] = 1 + printspace_mask[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2]] = 1 img_poly[:,:,0][printspace_mask[:,:] == 0] = printspace_class_rgb_color[0] img_poly[:,:,1][printspace_mask[:,:] == 0] = printspace_class_rgb_color[1] @@ -1252,7 +1219,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if "printspace_as_class_in_layout" in list(config_params.keys()): printspace_mask = np.zeros((img_poly.shape[0], img_poly.shape[1])) - printspace_mask[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2]] = 1 + printspace_mask[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2]] = 1 img_poly[:,:,0][printspace_mask[:,:] == 0] = printspace_class_label img_poly[:,:,1][printspace_mask[:,:] == 0] = printspace_class_label @@ -1261,7 +1229,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if printspace: - img_poly = img_poly[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + img_poly = img_poly[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] if 'columns_width' in list(config_params.keys()) and num_col: img_poly = resize_image(img_poly, y_new, x_new) @@ 
-1285,7 +1254,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ img_org = cv2.imread(os.path.join(dir_images, org_image_name)) if printspace: - img_org = img_org[bb_xywh[1]:bb_xywh[1]+bb_xywh[3], bb_xywh[0]:bb_xywh[0]+bb_xywh[2], :] + img_org = img_org[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] if 'columns_width' in list(config_params.keys()) and num_col: img_org = resize_image(img_org, y_new, x_new) @@ -1326,6 +1296,7 @@ def find_new_features_of_contours(contours_main): y_max_main = np.array([np.max(contours_main[j][:, 1]) for j in range(len(contours_main))]) return cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin + def read_xml(xml_file): file_name = Path(xml_file).stem tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8')) @@ -1344,57 +1315,13 @@ def read_xml(xml_file): index_tot_regions.append(jj.attrib['index']) tot_region_ref.append(jj.attrib['regionRef']) - if (link+'PrintSpace' in alltags) or (link+'Border' in alltags): - co_printspace = [] - if link+'PrintSpace' in alltags: - region_tags_printspace = np.unique([x for x in alltags if x.endswith('PrintSpace')]) - elif link+'Border' in alltags: - region_tags_printspace = np.unique([x for x in alltags if x.endswith('Border')]) - - for tag in region_tags_printspace: - if link+'PrintSpace' in alltags: - tag_endings_printspace = ['}PrintSpace','}printspace'] - elif link+'Border' in alltags: - tag_endings_printspace = ['}Border','}border'] - - if tag.endswith(tag_endings_printspace[0]) or tag.endswith(tag_endings_printspace[1]): - for nn in root1.iter(tag): - c_t_in = [] - sumi = 0 - for vv in nn.iter(): - # check the format of coords - if vv.tag == link + 'Coords': - coords = bool(vv.attrib) - if coords: - p_h = vv.attrib['points'].split(' ') - c_t_in.append( - np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) - break - else: - pass - - if vv.tag == link + 'Point': - 
c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))]) - sumi += 1 - elif vv.tag != link + 'Point' and sumi >= 1: - break - co_printspace.append(np.array(c_t_in)) - img_printspace = np.zeros( (y_len,x_len,3) ) - img_printspace=cv2.fillPoly(img_printspace, pts =co_printspace, color=(1,1,1)) - img_printspace = img_printspace.astype(np.uint8) - - imgray = cv2.cvtColor(img_printspace, cv2.COLOR_BGR2GRAY) - _, thresh = cv2.threshold(imgray, 0, 255, 0) - contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) - cnt = contours[np.argmax(cnt_size)] - x, y, w, h = cv2.boundingRect(cnt) - - bb_coord_printspace = [x, y, w, h] - + ps = (root1.xpath('/pc:PcGts/pc:Page/pc:Border', namespaces=NS) + + root1.xpath('/pc:PcGts/pc:Page/pc:PrintSpace', namespaces=NS)) + if len(ps): + points = ps[0].find('pc:Coords', NS).get('points') + ps_bbox = bbox_from_points(points) else: - bb_coord_printspace = None - + ps_bbox = [0, 0, None, None] region_tags=np.unique([x for x in alltags if x.endswith('Region')]) co_text_paragraph=[] @@ -1749,11 +1676,19 @@ def read_xml(xml_file): img_poly=cv2.fillPoly(img, pts =co_img, color=(4,4,4)) img_poly=cv2.fillPoly(img, pts =co_sep, color=(5,5,5)) - return tree1, root1, bb_coord_printspace, file_name, id_paragraph, id_header+id_heading, co_text_paragraph, co_text_header+co_text_heading,\ -tot_region_ref,x_len, y_len,index_tot_regions, img_poly - - - + return (tree1, + root1, + ps_bbox, + file_name, + id_paragraph, + id_header + id_heading, + co_text_paragraph, + co_text_header + co_text_heading, + tot_region_ref, + x_len, + y_len, + index_tot_regions, + img_poly) def bounding_box(cnt,color, corr_order_index ): x, y, w, h = cv2.boundingRect(cnt) diff --git a/src/eynollah/training/inference.py b/src/eynollah/training/inference.py index 15d1e6a..2ef1a91 100644 --- a/src/eynollah/training/inference.py +++ 
b/src/eynollah/training/inference.py @@ -196,7 +196,7 @@ class SBBPredict: img_height = self.config_params_model['input_height'] img_width = self.config_params_model['input_width'] - tree_xml, root_xml, bb_coord_printspace, file_name, \ + tree_xml, root_xml, ps_bbox, file_name, \ id_paragraph, id_header, \ co_text_paragraph, co_text_header, \ tot_region_ref, x_len, y_len, index_tot_regions, \ @@ -236,15 +236,13 @@ class SBBPredict: img_label=cv2.fillPoly(img_label, pts =[co_text_all[i]], color=(1,1,1)) labels_con[:,:,i] = img_label[:,:,0] - if bb_coord_printspace: - #bb_coord_printspace[x,y,w,h,_,_] - x = bb_coord_printspace[0] - y = bb_coord_printspace[1] - w = bb_coord_printspace[2] - h = bb_coord_printspace[3] - labels_con = labels_con[y:y+h, x:x+w, :] - img_poly = img_poly[y:y+h, x:x+w, :] - img_header_and_sep = img_header_and_sep[y:y+h, x:x+w] + if ps_bbox: + labels_con = labels_con[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] + img_poly = img_poly[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2], :] + img_header_and_sep = img_header_and_sep[ps_bbox[1]:ps_bbox[3], + ps_bbox[0]:ps_bbox[2]] From e69b35b49c4e7816b0e88d0d5d48f79aaf3f46db Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Jan 2026 13:49:23 +0100 Subject: [PATCH 055/118] training.train.config_params: re-organise to reflect dependencies - re-order keys belonging together logically - make keys dependent on each other --- src/eynollah/training/train.py | 222 +++++++++++++++++---------------- 1 file changed, 115 insertions(+), 107 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 6353474..e93281a 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -95,136 +95,144 @@ ex = Experiment(save_git_info=False) @ex.config def config_params(): + task = "segmentation" # This parameter defines task of model which can be segmentation, enhancement or classification. 
+ backbone_type = None # Type of image feature map network backbone. Either a vision transformer alongside a CNN we call "transformer", or only a CNN which we call "nontransformer" n_classes = None # Number of classes. In the case of binary classification this should be 2. - n_epochs = 1 # Number of epochs. + n_epochs = 1 # Number of epochs to train. + n_batch = 1 # Number of images per batch at each iteration. (Try as large as fits on VRAM.) input_height = 224 * 1 # Height of model's input in pixels. input_width = 224 * 1 # Width of model's input in pixels. weight_decay = 1e-6 # Weight decay of l2 regularization of model layers. - n_batch = 1 # Number of batches at each iteration. learning_rate = 1e-4 # Set the learning rate. - patches = False # Divides input image into smaller patches (input size of the model) when set to true. For the model to see the full image, like page extraction, set this to false. - augmentation = False # To apply any kind of augmentation, this parameter must be set to true. - flip_aug = False # If true, different types of flipping will be applied to the image. Types of flips are defined with "flip_index" in config_params.json. - blur_aug = False # If true, different types of blurring will be applied to the image. Types of blur are defined with "blur_k" in config_params.json. - padding_white = False # If true, white padding will be applied to the image. - padding_black = False # If true, black padding will be applied to the image. - scaling = False # If true, scaling will be applied to the image. The amount of scaling is defined with "scales" in config_params.json. - shifting = False - degrading = False # If true, degrading will be applied to the image. The amount of degrading is defined with "degrade_scales" in config_params.json. - brightening = False # If true, brightening will be applied to the image. The amount of brightening is defined with "brightness" in config_params.json. 
- binarization = False # If true, Otsu thresholding will be applied to augment the input with binarized images. - adding_rgb_background = False - adding_rgb_foreground = False - add_red_textlines = False - channels_shuffling = False - dir_train = None # Directory of training dataset with subdirectories having the names "images" and "labels". - dir_eval = None # Directory of validation dataset with subdirectories having the names "images" and "labels". - dir_output = None # Directory where the output model will be saved. - pretraining = False # Set to true to load pretrained weights of ResNet50 encoder. - scaling_bluring = False # If true, a combination of scaling and blurring will be applied to the image. - scaling_binarization = False # If true, a combination of scaling and binarization will be applied to the image. - rotation = False # If true, a 90 degree rotation will be implemeneted. - rotation_not_90 = False # If true rotation based on provided angles with thetha will be implemeneted. - scaling_brightness = False # If true, a combination of scaling and brightening will be applied to the image. - scaling_flip = False # If true, a combination of scaling and flipping will be applied to the image. - thetha = None # Rotate image by these angles for augmentation. - shuffle_indexes = None - blur_k = None # Blur image for augmentation. - scales = None # Scale patches for augmentation. - degrade_scales = None # Degrade image for augmentation. - brightness = None # Brighten image for augmentation. - flip_index = None # Flip image for augmentation. - continue_training = False # Set to true if you would like to continue training an already trained a model. - transformer_patchsize_x = None # Patch size of vision transformer patches in x direction. - transformer_patchsize_y = None # Patch size of vision transformer patches in y direction. - transformer_num_patches_xy = None # Number of patches for vision transformer in x and y direction respectively. 
- transformer_projection_dim = 64 # Transformer projection dimension. Default value is 64. - transformer_mlp_head_units = [128, 64] # Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64] - transformer_layers = 8 # transformer layers. Default value is 8. - transformer_num_heads = 4 # Transformer number of heads. Default value is 4. - transformer_cnn_first = True # We have two types of vision transformers. In one type, a CNN is applied first, followed by a transformer. In the other type, this order is reversed. If transformer_cnn_first is true, it means the CNN will be applied before the transformer. Default value is true. - index_start = 0 # Index of model to continue training from. E.g. if you trained for 3 epochs and last index is 2, to continue from model_1.h5, set "index_start" to 3 to start naming model with index 3. - dir_of_start_model = '' # Directory containing pretrained encoder to continue training the model. is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false. weighted_loss = False # Use weighted categorical cross entropy as loss fucntion. When set to true, "is_loss_soft_dice" must be false. - data_is_provided = False # Only set this to true when you have already provided the input data and the train and eval data are in "dir_output". - task = "segmentation" # This parameter defines task of model which can be segmentation, enhancement or classification. f1_threshold_classification = None # This threshold is used to consider models with an evaluation f1 scores bigger than it. The selected model weights undergo a weights ensembling. And avreage ensembled model will be written to output. classification_classes_name = None # Dictionary of classification classes names. - backbone_type = None # As backbone we have 2 types of backbones. 
A vision transformer alongside a CNN and we call it "transformer" and only CNN called "nontransformer" - save_interval = None - dir_img_bin = None - number_of_backgrounds_per_image = 1 - dir_rgb_backgrounds = None - dir_rgb_foregrounds = None + patches = False # Divides input image into smaller patches (input size of the model) when set to true. For the model to see the full image, like page extraction, set this to false. + augmentation = False # To apply any kind of augmentation, this parameter must be set to true. + if augmentation: + flip_aug = False # If true, different types of flipping will be applied to the image. Types of flips are defined with "flip_index" in config_params.json. + if flip_aug: + flip_index = None # Flip image for augmentation. + blur_aug = False # If true, different types of blurring will be applied to the image. Types of blur are defined with "blur_k" in config_params.json. + if blur_aug: + blur_k = None # Blur image for augmentation. + padding_white = False # If true, white padding will be applied to the image. + padding_black = False # If true, black padding will be applied to the image. + scaling = False # If true, scaling will be applied to the image. The amount of scaling is defined with "scales" in config_params.json. + scaling_bluring = False # If true, a combination of scaling and blurring will be applied to the image. + scaling_binarization = False # If true, a combination of scaling and binarization will be applied to the image. + scaling_brightness = False # If true, a combination of scaling and brightening will be applied to the image. + scaling_flip = False # If true, a combination of scaling and flipping will be applied to the image. + if scaling or scaling_brightness or scaling_bluring or scaling_binarization or scaling_flip: + scales = None # Scale patches for augmentation. + shifting = False + degrading = False # If true, degrading will be applied to the image. 
The amount of degrading is defined with "degrade_scales" in config_params.json. + if degrading: + degrade_scales = None # Degrade image for augmentation. + brightening = False # If true, brightening will be applied to the image. The amount of brightening is defined with "brightness" in config_params.json. + if brightening: + brightness = None # Brighten image for augmentation. + binarization = False # If true, Otsu thresholding will be applied to augment the input with binarized images. + if binarization: + dir_img_bin = None # Directory of training dataset subdirectory of binarized images + add_red_textlines = False + adding_rgb_background = False + if adding_rgb_background: + dir_rgb_backgrounds = None # Directory of texture images for synthetic background + adding_rgb_foreground = False + if adding_rgb_foreground: + dir_rgb_foregrounds = None # Directory of texture images for synthetic foreground + if adding_rgb_background or adding_rgb_foreground: + number_of_backgrounds_per_image = 1 + channels_shuffling = False # Re-arrange color channels. + if channels_shuffling: + shuffle_indexes = None # Which channels to switch between. + rotation = False # If true, a 90 degree rotation will be implemeneted. + rotation_not_90 = False # If true rotation based on provided angles with thetha will be implemeneted. + if rotation_not_90: + thetha = None # Rotate image by these angles for augmentation. + dir_train = None # Directory of training dataset with subdirectories having the names "images" and "labels". + dir_eval = None # Directory of validation dataset with subdirectories having the names "images" and "labels". + dir_output = None # Directory where the augmented training data and the model checkpoints will be saved. + pretraining = False # Set to true to (down)load pretrained weights of ResNet50 encoder. 
+ save_interval = None # frequency for writing model checkpoints (nonzero integer for number of batches, or zero for epoch) + continue_training = False # Set to true if you would like to continue training an already trained a model. + dir_of_start_model = '' # Directory containing pretrained encoder to continue training the model. + data_is_provided = False # Only set this to true when you have already provided the input data and the train and eval data are in "dir_output". + if backbone_type == "transformer": + transformer_patchsize_x = None # Patch size of vision transformer patches in x direction. + transformer_patchsize_y = None # Patch size of vision transformer patches in y direction. + transformer_num_patches_xy = None # Number of patches for vision transformer in x and y direction respectively. + transformer_projection_dim = 64 # Transformer projection dimension. Default value is 64. + transformer_mlp_head_units = [128, 64] # Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64] + transformer_layers = 8 # transformer layers. Default value is 8. + transformer_num_heads = 4 # Transformer number of heads. Default value is 4. + transformer_cnn_first = True # We have two types of vision transformers: either the CNN is applied first, followed by the transformer, or reversed. 
@ex.automain -def run(_config, n_classes, n_epochs, input_height, - input_width, weight_decay, weighted_loss, - index_start, dir_of_start_model, is_loss_soft_dice, - n_batch, patches, augmentation, flip_aug, - blur_aug, padding_white, padding_black, scaling, shifting, degrading,channels_shuffling, - brightening, binarization, adding_rgb_background, adding_rgb_foreground, add_red_textlines, blur_k, scales, degrade_scales,shuffle_indexes, - brightness, dir_train, data_is_provided, scaling_bluring, - scaling_brightness, scaling_binarization, rotation, rotation_not_90, - thetha, scaling_flip, continue_training, transformer_projection_dim, - transformer_mlp_head_units, transformer_layers, transformer_num_heads, transformer_cnn_first, - transformer_patchsize_x, transformer_patchsize_y, - transformer_num_patches_xy, backbone_type, save_interval, flip_index, dir_eval, dir_output, - pretraining, learning_rate, task, f1_threshold_classification, classification_classes_name, dir_img_bin, number_of_backgrounds_per_image,dir_rgb_backgrounds, dir_rgb_foregrounds): +def run(_config, + _log, + task, + pretraining, + data_is_provided, + dir_train, + dir_eval, + dir_output, + n_classes, + n_epochs, + n_batch, + input_height, + input_width, + is_loss_soft_dice, + weighted_loss, + weight_decay, + learning_rate, + continue_training, + dir_of_start_model, + save_interval, + augmentation, + thetha, + backbone_type, + transformer_projection_dim, + transformer_mlp_head_units, + transformer_layers, + transformer_num_heads, + transformer_cnn_first, + transformer_patchsize_x, + transformer_patchsize_y, + transformer_num_patches_xy, + f1_threshold_classification, + classification_classes_name, +): if pretraining and not os.path.isfile(RESNET50_WEIGHTS_PATH): - print("downloading RESNET50 pretrained weights to", RESNET50_WEIGHTS_PATH) + _log.info("downloading RESNET50 pretrained weights to %s", RESNET50_WEIGHTS_PATH) download_file(RESNET50_WEIGHTS_URL, RESNET50_WEIGHTS_PATH) - - if 
dir_rgb_backgrounds: - list_all_possible_background_images = os.listdir(dir_rgb_backgrounds) - else: - list_all_possible_background_images = None - - if dir_rgb_foregrounds: - list_all_possible_foreground_rgbs = os.listdir(dir_rgb_foregrounds) - else: - list_all_possible_foreground_rgbs = None - + + # set the gpu configuration + configuration() + if task in ["segmentation", "enhancement", "binarization"]: - if data_is_provided: - dir_train_flowing = os.path.join(dir_output, 'train') - dir_eval_flowing = os.path.join(dir_output, 'eval') - - dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images') - dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels') + dir_train_flowing = os.path.join(dir_output, 'train') + dir_eval_flowing = os.path.join(dir_output, 'eval') - dir_flow_eval_imgs = os.path.join(dir_eval_flowing, 'images') - dir_flow_eval_labels = os.path.join(dir_eval_flowing, 'labels') + dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images') + dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels') - configuration() - - else: - dir_img, dir_seg = get_dirs_or_files(dir_train) - dir_img_val, dir_seg_val = get_dirs_or_files(dir_eval) - - # make first a directory in output for both training and evaluations in order to flow data from these directories. - dir_train_flowing = os.path.join(dir_output, 'train') - dir_eval_flowing = os.path.join(dir_output, 'eval') - - dir_flow_train_imgs = os.path.join(dir_train_flowing, 'images/') - dir_flow_train_labels = os.path.join(dir_train_flowing, 'labels/') - - dir_flow_eval_imgs = os.path.join(dir_eval_flowing, 'images/') - dir_flow_eval_labels = os.path.join(dir_eval_flowing, 'labels/') + dir_flow_eval_imgs = os.path.join(dir_eval_flowing, 'images') + dir_flow_eval_labels = os.path.join(dir_eval_flowing, 'labels') + if not data_is_provided: + # first create a directory in output for both training and evaluations + # in order to flow data from these directories. 
if os.path.isdir(dir_train_flowing): os.system('rm -rf ' + dir_train_flowing) - os.makedirs(dir_train_flowing) - else: - os.makedirs(dir_train_flowing) + os.makedirs(dir_train_flowing) if os.path.isdir(dir_eval_flowing): os.system('rm -rf ' + dir_eval_flowing) - os.makedirs(dir_eval_flowing) - else: - os.makedirs(dir_eval_flowing) + os.makedirs(dir_eval_flowing) os.mkdir(dir_flow_train_imgs) os.mkdir(dir_flow_train_labels) From 29a0f19cee579665d5edfaa8b3d2bbc8e3bb31b0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Jan 2026 13:53:11 +0100 Subject: [PATCH 056/118] =?UTF-8?q?training:=20simplify=20image=20preproce?= =?UTF-8?q?ssing=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `utils.provide_patches`: split up loop into * `utils.preprocess_img` (single img function) * `utils.preprocess_imgs` (top-level loop) - capture exceptions for all cases (not just some) at top level and with informative logging - avoid repeating / delegating config keys in several places: only as kwargs to `preprocess_img()` - read files into memory only once, then re-use - improve readability (avoiding long lines, repeated code) --- src/eynollah/training/train.py | 81 ++-- src/eynollah/training/utils.py | 799 ++++++++++++++++++++------------- 2 files changed, 510 insertions(+), 370 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index e93281a..9c638ea 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -26,7 +26,7 @@ from eynollah.training.utils import ( generate_data_from_folder_evaluation, generate_data_from_folder_training, get_one_hot, - provide_patches, + preprocess_imgs, return_number_of_total_training_data ) @@ -240,9 +240,9 @@ def run(_config, os.mkdir(dir_flow_eval_imgs) os.mkdir(dir_flow_eval_labels) - # set the gpu configuration - configuration() - + dir_img, dir_seg = get_dirs_or_files(dir_train) + dir_img_val, dir_seg_val = 
get_dirs_or_files(dir_eval) + imgs_list=np.array(os.listdir(dir_img)) segs_list=np.array(os.listdir(dir_seg)) @@ -250,50 +250,21 @@ def run(_config, segs_list_test=np.array(os.listdir(dir_seg_val)) # writing patches into a sub-folder in order to be flowed from directory. - common_args = [input_height, input_width, - blur_k, blur_aug, - padding_white, padding_black, - flip_aug, binarization, - adding_rgb_background, - adding_rgb_foreground, - add_red_textlines, - channels_shuffling, - scaling, shifting, degrading, brightening, - scales, degrade_scales, brightness, - flip_index, shuffle_indexes, - scaling_bluring, scaling_brightness, scaling_binarization, - rotation, rotation_not_90, thetha, - scaling_flip, task, - ] - common_kwargs = dict(patches= - patches, - dir_img_bin= - dir_img_bin, - number_of_backgrounds_per_image= - number_of_backgrounds_per_image, - list_all_possible_background_images= - list_all_possible_background_images, - dir_rgb_backgrounds= - dir_rgb_backgrounds, - dir_rgb_foregrounds= - dir_rgb_foregrounds, - list_all_possible_foreground_rgbs= - list_all_possible_foreground_rgbs, - ) - provide_patches(imgs_list, segs_list, - dir_img, dir_seg, + preprocess_imgs(_config, + imgs_list, + segs_list, + dir_img, + dir_seg, dir_flow_train_imgs, - dir_flow_train_labels, - *common_args, - augmentation=augmentation, - **common_kwargs) - provide_patches(imgs_list_test, segs_list_test, - dir_img_val, dir_seg_val, + dir_flow_train_labels) + preprocess_imgs(_config, + imgs_list_test, + segs_list_test, + dir_img_val, + dir_seg_val, dir_flow_eval_imgs, dir_flow_eval_labels, - *common_args, - augmentation=False, - **common_kwargs) + augmentation=False) if weighted_loss: weights = np.zeros(n_classes) @@ -307,8 +278,8 @@ def run(_config, label_obj = cv2.imread(label_file) label_obj_one_hot = get_one_hot(label_obj, label_obj.shape[0], label_obj.shape[1], n_classes) weights += (label_obj_one_hot.sum(axis=0)).sum(axis=0) - except Exception as e: - print("error reading data 
file '%s': %s" % (label_file, e), file=sys.stderr) + except Exception: + _log.exception("error reading data file '%s'", label_file) weights = 1.00 / weights weights = weights / float(np.sum(weights)) @@ -340,7 +311,6 @@ def run(_config, custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) else: - index_start = 0 if backbone_type == 'nontransformer': model = resnet50_unet(n_classes, input_height, @@ -391,7 +361,7 @@ def run(_config, pretraining) #if you want to see the model structure just uncomment model summary. - model.summary() + #model.summary() if task in ["segmentation", "binarization"]: if is_loss_soft_dice: @@ -423,7 +393,12 @@ def run(_config, SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) - + + _log.info("training on %d batches in %d epochs", + len(os.listdir(dir_flow_train_imgs)) // n_batch - 1, + n_epochs) + _log.info("validating on %d batches", + len(os.listdir(dir_flow_eval_imgs)) // n_batch - 1) model.fit( train_gen, steps_per_epoch=len(os.listdir(dir_flow_train_imgs)) // n_batch - 1, @@ -439,7 +414,6 @@ def run(_config, #model.save(dir_output+'/'+'model'+'.h5') elif task=='classification': - configuration() model = resnet50_classifier(n_classes, input_height, input_width, @@ -474,7 +448,7 @@ def run(_config, usable_checkpoints = np.flatnonzero(np.array(history['val_f1']) > f1_threshold_classification) if len(usable_checkpoints) >= 1: - print("averaging over usable checkpoints", usable_checkpoints) + _log.info("averaging over usable checkpoints: %s", str(usable_checkpoints)) all_weights = [] for epoch in usable_checkpoints: cp_path = os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch)) @@ -495,10 +469,9 @@ def run(_config, model.save(cp_path) with open(os.path.join(cp_path, "config.json"), "w") as fp: json.dump(_config, fp) # encode dict into JSON - print("ensemble model saved under", cp_path) + _log.info("ensemble model saved 
under '%s'", cp_path) elif task=='reading_order': - configuration() model = machine_based_reading_order_model( n_classes, input_height, input_width, weight_decay, pretraining) diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index 1278be5..61b2536 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -1,6 +1,7 @@ import os import math import random +from logging import getLogger import cv2 import numpy as np @@ -266,8 +267,9 @@ def generate_data_from_folder_training(path_classes, batchsize, height, width, n ret_y= np.zeros((batchsize, n_classes)).astype(np.int16) batchcount = 0 -def do_brightening(img_in_dir, factor): - im = Image.open(img_in_dir) +def do_brightening(img, factor): + img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + im = Image.fromarray(img_rgb) enhancer = ImageEnhance.Brightness(im) out_img = enhancer.enhance(factor) out_img = out_img.convert('RGB') @@ -737,321 +739,486 @@ def get_patches_num_scale_new(dir_img_f, dir_seg_f, img, label, height, width, i return indexer -def provide_patches(imgs_list_train, segs_list_train, dir_img, dir_seg, dir_flow_train_imgs, - dir_flow_train_labels, input_height, input_width, blur_k, blur_aug, - padding_white, padding_black, flip_aug, binarization, adding_rgb_background, adding_rgb_foreground, add_red_textlines, channels_shuffling, scaling, shifting, degrading, - brightening, scales, degrade_scales, brightness, flip_index, shuffle_indexes, - scaling_bluring, scaling_brightness, scaling_binarization, rotation, - rotation_not_90, thetha, scaling_flip, task, augmentation=False, patches=False, dir_img_bin=None,number_of_backgrounds_per_image=None,list_all_possible_background_images=None, dir_rgb_backgrounds=None, dir_rgb_foregrounds=None, list_all_possible_foreground_rgbs=None): - +def preprocess_imgs(config, + imgs_list, + segs_list, + dir_img, + dir_seg, + dir_flow_imgs, + dir_flow_labels, + logger=None, + **kwargs, +): + if logger is None: + logger = 
getLogger('') + + # make a copy for this run + config = dict(config) + # add derived keys not part of config + if config.get('dir_rgb_backgrounds', None): + config['list_all_possible_background_images'] = \ + os.listdir(config['dir_rgb_backgrounds']) + if config.get('dir_rgb_foregrounds', None): + config['list_all_possible_foreground_rgbs'] = \ + os.listdir(config['dir_rgb_foregrounds']) + # override keys from call + config.update(kwargs) + indexer = 0 - for im, seg_i in tqdm(zip(imgs_list_train, segs_list_train)): + for im, seg_i in tqdm(zip(imgs_list, segs_list)): + img = cv2.imread(os.path.join(dir_img, im)) img_name = os.path.splitext(im)[0] - if task == "segmentation" or task == "binarization": - dir_of_label_file = os.path.join(dir_seg, img_name + '.png') - elif task=="enhancement": - dir_of_label_file = os.path.join(dir_seg, im) - - if not patches: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_img + '/' + im), input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - - if augmentation: - if flip_aug: - for f_i in flip_index: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - resize_image(cv2.flip(cv2.imread(dir_img+'/'+im),f_i),input_height,input_width) ) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.flip(cv2.imread(dir_of_label_file), f_i), input_height, input_width)) - indexer += 1 - - if blur_aug: - for blur_i in blur_k: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - (resize_image(bluring(cv2.imread(dir_img + '/' + im), blur_i), input_height, input_width))) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - if brightening: - for factor in brightness: - try: - 
cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - (resize_image(do_brightening(dir_img + '/' +im, factor), input_height, input_width))) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - except: - pass - - if binarization: - - if dir_img_bin: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - resize_image(img_bin_corr, input_height, input_width)) - else: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - resize_image(otsu_copy(cv2.imread(dir_img + '/' + im)), input_height, input_width)) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - - if degrading: - for degrade_scale_ind in degrade_scales: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - (resize_image(do_degrading(cv2.imread(dir_img + '/' + im), degrade_scale_ind), input_height, input_width))) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - - if rotation_not_90: - for thetha_i in thetha: - img_max_rotated, label_max_rotated = rotation_not_90_func(cv2.imread(dir_img + '/'+im), - cv2.imread(dir_of_label_file), thetha_i) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_max_rotated, input_height, input_width)) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(label_max_rotated, input_height, input_width)) - indexer += 1 - - if channels_shuffling: - for shuffle_index in shuffle_indexes: - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', - (resize_image(return_shuffled_channels(cv2.imread(dir_img + '/' + im), shuffle_index), input_height, 
input_width))) - - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - indexer += 1 - - if scaling: - for sc_ind in scales: - img_scaled, label_scaled = scale_image_for_no_patch(cv2.imread(dir_img + '/'+im), - cv2.imread(dir_of_label_file), sc_ind) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_scaled, input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(label_scaled, input_height, input_width)) - indexer += 1 - if shifting: - shift_types = ['xpos', 'xmin', 'ypos', 'ymin', 'xypos', 'xymin'] - for st_ind in shift_types: - img_shifted, label_shifted = shift_image_and_label(cv2.imread(dir_img + '/'+im), - cv2.imread(dir_of_label_file), st_ind) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_shifted, input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', resize_image(label_shifted, input_height, input_width)) - indexer += 1 - - - if adding_rgb_background: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - for i_n in range(number_of_backgrounds_per_image): - background_image_chosen_name = random.choice(list_all_possible_background_images) - img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) - img_with_overlayed_background = return_binary_image_with_given_rgb_background(img_bin_corr, img_rgb_background_chosen) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_with_overlayed_background, input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - - indexer += 1 - - if adding_rgb_foreground: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - for i_n in 
range(number_of_backgrounds_per_image): - background_image_chosen_name = random.choice(list_all_possible_background_images) - foreground_rgb_chosen_name = random.choice(list_all_possible_foreground_rgbs) - - img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) - foreground_rgb_chosen = np.load(dir_rgb_foregrounds + '/' + foreground_rgb_chosen_name) - - img_with_overlayed_background = return_binary_image_with_given_rgb_background_and_given_foreground_rgb(img_bin_corr, img_rgb_background_chosen, foreground_rgb_chosen) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_with_overlayed_background, input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - - indexer += 1 - - if add_red_textlines: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - img_red_context = return_image_with_red_elements(cv2.imread(dir_img + '/'+im), img_bin_corr) - - cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', resize_image(img_red_context, input_height, input_width)) - cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', - resize_image(cv2.imread(dir_of_label_file), input_height, input_width)) - - indexer += 1 - - - - - if patches: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - cv2.imread(dir_img + '/' + im), cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - if augmentation: - if rotation: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - rotation_90(cv2.imread(dir_img + '/' + im)), - rotation_90(cv2.imread(dir_of_label_file)), - input_height, input_width, indexer=indexer) - - if rotation_not_90: - for thetha_i in thetha: - img_max_rotated, label_max_rotated = rotation_not_90_func(cv2.imread(dir_img + '/'+im), - cv2.imread(dir_of_label_file), thetha_i) - indexer = 
get_patches(dir_flow_train_imgs, dir_flow_train_labels, - img_max_rotated, - label_max_rotated, - input_height, input_width, indexer=indexer) - - if channels_shuffling: - for shuffle_index in shuffle_indexes: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - return_shuffled_channels(cv2.imread(dir_img + '/' + im), shuffle_index), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - if adding_rgb_background: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - for i_n in range(number_of_backgrounds_per_image): - background_image_chosen_name = random.choice(list_all_possible_background_images) - img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) - img_with_overlayed_background = return_binary_image_with_given_rgb_background(img_bin_corr, img_rgb_background_chosen) - - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - img_with_overlayed_background, - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - - if adding_rgb_foreground: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - for i_n in range(number_of_backgrounds_per_image): - background_image_chosen_name = random.choice(list_all_possible_background_images) - foreground_rgb_chosen_name = random.choice(list_all_possible_foreground_rgbs) - - img_rgb_background_chosen = cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) - foreground_rgb_chosen = np.load(dir_rgb_foregrounds + '/' + foreground_rgb_chosen_name) - - img_with_overlayed_background = return_binary_image_with_given_rgb_background_and_given_foreground_rgb(img_bin_corr, img_rgb_background_chosen, foreground_rgb_chosen) - - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - img_with_overlayed_background, - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - - if add_red_textlines: - img_bin_corr = cv2.imread(dir_img_bin + '/' + 
img_name+'.png') - img_red_context = return_image_with_red_elements(cv2.imread(dir_img + '/'+im), img_bin_corr) - - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - img_red_context, - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - if flip_aug: - for f_i in flip_index: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - cv2.flip(cv2.imread(dir_img + '/' + im), f_i), - cv2.flip(cv2.imread(dir_of_label_file), f_i), - input_height, input_width, indexer=indexer) - if blur_aug: - for blur_i in blur_k: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - bluring(cv2.imread(dir_img + '/' + im), blur_i), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - if padding_black: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - do_padding_black(cv2.imread(dir_img + '/' + im)), - do_padding_label(cv2.imread(dir_of_label_file)), - input_height, input_width, indexer=indexer) - - if padding_white: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - do_padding_white(cv2.imread(dir_img + '/'+im)), - do_padding_label(cv2.imread(dir_of_label_file)), - input_height, input_width, indexer=indexer) - - if brightening: - for factor in brightness: - try: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - do_brightening(dir_img + '/' +im, factor), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - except: - pass - if scaling: - for sc_ind in scales: - indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, - cv2.imread(dir_img + '/' + im) , - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer, scaler=sc_ind) - - if degrading: - for degrade_scale_ind in degrade_scales: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - do_degrading(cv2.imread(dir_img + '/' + im), degrade_scale_ind), - cv2.imread(dir_of_label_file), - 
input_height, input_width, indexer=indexer) - - if binarization: - if dir_img_bin: - img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') - - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - img_bin_corr, - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) - - else: - indexer = get_patches(dir_flow_train_imgs, dir_flow_train_labels, - otsu_copy(cv2.imread(dir_img + '/' + im)), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer) + if config['task'] in ["segmentation", "binarization"]: + lab = cv2.imread(os.path.join(dir_seg, img_name + '.png')) + elif config['task'] == "enhancement": + lab = cv2.imread(os.path.join(dir_seg, im)) + else: + lab = None - if scaling_brightness: - for sc_ind in scales: - for factor in brightness: - try: - indexer = get_patches_num_scale_new(dir_flow_train_imgs, - dir_flow_train_labels, - do_brightening(dir_img + '/' + im, factor) - ,cv2.imread(dir_of_label_file) - ,input_height, input_width, indexer=indexer, scaler=sc_ind) - except: - pass - - if scaling_bluring: - for sc_ind in scales: - for blur_i in blur_k: - indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, - bluring(cv2.imread(dir_img + '/' + im), blur_i), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer, scaler=sc_ind) + try: + indexer = preprocess_img(indexer, img, img_name, lab, + dir_flow_imgs, + dir_flow_labels, + **config) - if scaling_binarization: - for sc_ind in scales: - indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, - otsu_copy(cv2.imread(dir_img + '/' + im)), - cv2.imread(dir_of_label_file), - input_height, input_width, indexer=indexer, scaler=sc_ind) - - if scaling_flip: - for sc_ind in scales: - for f_i in flip_index: - indexer = get_patches_num_scale_new(dir_flow_train_imgs, dir_flow_train_labels, - cv2.flip( cv2.imread(dir_img + '/' + im), f_i), - cv2.flip(cv2.imread(dir_of_label_file), f_i), - 
input_height, input_width, indexer=indexer, scaler=sc_ind) + except: + logger.exception("skipping image %s", img_name) + +def preprocess_img(indexer, + img, + img_name, + lab, + dir_flow_train_imgs, + dir_flow_train_labels, + input_height=None, + input_width=None, + augmentation=False, + flip_aug=False, + flip_index=None, + blur_aug=False, + blur_k=None, + padding_white=False, + padding_black=False, + scaling=False, + scaling_bluring=False, + scaling_brightness=False, + scaling_binarization=False, + scaling_flip=False, + scales=None, + shifting=False, + degrading=False, + degrade_scales=None, + brightening=False, + brightness=None, + binarization=False, + dir_img_bin=None, + add_red_textlines=False, + adding_rgb_background=False, + dir_rgb_backgrounds=None, + adding_rgb_foreground=False, + dir_rgb_foregrounds=None, + number_of_backgrounds_per_image=None, + channels_shuffling=False, + shuffle_indexes=None, + rotation=False, + rotation_not_90=False, + thetha=None, + patches=False, + list_all_possible_background_images=None, + list_all_possible_foreground_rgbs=None, + **kwargs, +): + if not patches: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if augmentation: + if flip_aug: + for f_i in flip_index: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(cv2.flip(img, f_i), + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(cv2.flip(lab, f_i), + input_height, + input_width)) + indexer += 1 + if blur_aug: + for blur_i in blur_k: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + (resize_image(bluring(img, blur_i), + input_height, + input_width))) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + 
input_height, + input_width)) + indexer += 1 + if brightening: + for factor in brightness: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + (resize_image(do_brightening(img, factor), + input_height, + input_width))) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if binarization: + if dir_img_bin: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_bin_corr, + input_height, + input_width)) + else: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(otsu_copy(img), + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if degrading: + for degrade_scale_ind in degrade_scales: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + (resize_image(do_degrading(img, degrade_scale_ind), + input_height, + input_width))) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if rotation_not_90: + for thetha_i in thetha: + img_max_rotated, label_max_rotated = \ + rotation_not_90_func(img, lab, thetha_i) + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_max_rotated, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(label_max_rotated, + input_height, + input_width)) + indexer += 1 + if channels_shuffling: + for shuffle_index in shuffle_indexes: + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + (resize_image(return_shuffled_channels(img, shuffle_index), + input_height, + input_width))) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + 
input_width)) + indexer += 1 + if scaling: + for sc_ind in scales: + img_scaled, label_scaled = \ + scale_image_for_no_patch(img, lab, sc_ind) + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_scaled, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(label_scaled, + input_height, + input_width)) + indexer += 1 + if shifting: + shift_types = ['xpos', 'xmin', 'ypos', 'ymin', 'xypos', 'xymin'] + for st_ind in shift_types: + img_shifted, label_shifted = \ + shift_image_and_label(img, lab, st_ind) + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_shifted, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(label_shifted, + input_height, + input_width)) + indexer += 1 + if adding_rgb_background: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + for i_n in range(number_of_backgrounds_per_image): + background_image_chosen_name = random.choice(list_all_possible_background_images) + img_rgb_background_chosen = \ + cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) + img_with_overlayed_background = \ + return_binary_image_with_given_rgb_background( + img_bin_corr, img_rgb_background_chosen) + cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png', + resize_image(img_with_overlayed_background, + input_height, + input_width)) + cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png', + resize_image(lab, + input_height, + input_width)) + indexer += 1 + if adding_rgb_foreground: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + for i_n in range(number_of_backgrounds_per_image): + background_image_chosen_name = random.choice(list_all_possible_background_images) + foreground_rgb_chosen_name = random.choice(list_all_possible_foreground_rgbs) + img_rgb_background_chosen = \ + 
cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name)
+                foreground_rgb_chosen = \
+                    np.load(dir_rgb_foregrounds + '/' + foreground_rgb_chosen_name)
+                img_with_overlayed_background = \
+                    return_binary_image_with_given_rgb_background_and_given_foreground_rgb(
+                        img_bin_corr, img_rgb_background_chosen, foreground_rgb_chosen)
+                cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png',
+                            resize_image(img_with_overlayed_background,
+                                         input_height,
+                                         input_width))
+                cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png',
+                            resize_image(lab,
+                                         input_height,
+                                         input_width))
+                indexer += 1
+            if add_red_textlines:
+                img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png')
+                img_red_context = \
+                    return_image_with_red_elements(img, img_bin_corr)
+                cv2.imwrite(dir_flow_train_imgs + '/img_' + str(indexer) + '.png',
+                            resize_image(img_red_context,
+                                         input_height,
+                                         input_width))
+                cv2.imwrite(dir_flow_train_labels + '/img_' + str(indexer) + '.png',
+                            resize_image(lab,
+                                         input_height,
+                                         input_width))
+                indexer += 1
+    else:
+        indexer = get_patches(dir_flow_train_imgs,
+                              dir_flow_train_labels,
+                              img,
+                              lab,
+                              input_height,
+                              input_width,
+                              indexer=indexer)
+        if augmentation:
+            if rotation:
+                indexer = get_patches(dir_flow_train_imgs,
+                                      dir_flow_train_labels,
+                                      rotation_90(img),
+                                      rotation_90(lab),
+                                      input_height,
+                                      input_width,
+                                      indexer=indexer)
+            if rotation_not_90:
+                for thetha_i in thetha:
+                    img_max_rotated, label_max_rotated = \
+                        rotation_not_90_func(img, lab, thetha_i)
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          img_max_rotated,
+                                          label_max_rotated,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if channels_shuffling:
+                for shuffle_index in shuffle_indexes:
+                    img_shuffled = \
+                        return_shuffled_channels(img, shuffle_index)
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          img_shuffled,
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if adding_rgb_background:
+                
img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + for i_n in range(number_of_backgrounds_per_image): + background_image_chosen_name = random.choice(list_all_possible_background_images) + img_rgb_background_chosen = \ + cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) + img_with_overlayed_background = \ + return_binary_image_with_given_rgb_background( + img_bin_corr, img_rgb_background_chosen) + indexer = get_patches(dir_flow_train_imgs, + dir_flow_train_labels, + img_with_overlayed_background, + lab, + input_height, + input_width, + indexer=indexer) + if adding_rgb_foreground: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + for i_n in range(number_of_backgrounds_per_image): + background_image_chosen_name = random.choice(list_all_possible_background_images) + foreground_rgb_chosen_name = random.choice(list_all_possible_foreground_rgbs) + img_rgb_background_chosen = \ + cv2.imread(dir_rgb_backgrounds + '/' + background_image_chosen_name) + foreground_rgb_chosen = \ + np.load(dir_rgb_foregrounds + '/' + foreground_rgb_chosen_name) + img_with_overlayed_background = \ + return_binary_image_with_given_rgb_background_and_given_foreground_rgb( + img_bin_corr, img_rgb_background_chosen, foreground_rgb_chosen) + indexer = get_patches(dir_flow_train_imgs, + dir_flow_train_labels, + img_with_overlayed_background, + lab, + input_height, + input_width, + indexer=indexer) + if add_red_textlines: + img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png') + img_red_context = \ + return_image_with_red_elements(img, img_bin_corr) + indexer = get_patches(dir_flow_train_imgs, + dir_flow_train_labels, + img_red_context, + lab, + input_height, + input_width, + indexer=indexer) + if flip_aug: + for f_i in flip_index: + indexer = get_patches(dir_flow_train_imgs, + dir_flow_train_labels, + cv2.flip(img, f_i), + cv2.flip(lab, f_i), + input_height, + input_width, + indexer=indexer) + if blur_aug: + for blur_i in blur_k: + indexer = 
get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          bluring(img, blur_i),
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if padding_black:
+                indexer = get_patches(dir_flow_train_imgs,
+                                      dir_flow_train_labels,
+                                      do_padding_black(img),
+                                      do_padding_label(lab),
+                                      input_height,
+                                      input_width,
+                                      indexer=indexer)
+            if padding_white:
+                indexer = get_patches(dir_flow_train_imgs,
+                                      dir_flow_train_labels,
+                                      do_padding_white(img),
+                                      do_padding_label(lab),
+                                      input_height,
+                                      input_width,
+                                      indexer=indexer)
+            if brightening:
+                for factor in brightness:
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          do_brightening(img, factor),
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if scaling:
+                for sc_ind in scales:
+                    indexer = get_patches_num_scale_new(
+                        dir_flow_train_imgs,
+                        dir_flow_train_labels,
+                        img,
+                        lab,
+                        input_height,
+                        input_width,
+                        indexer=indexer,
+                        scaler=sc_ind)
+            if degrading:
+                for degrade_scale_ind in degrade_scales:
+                    img_deg = \
+                        do_degrading(img, degrade_scale_ind)
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          img_deg,
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if binarization:
+                if dir_img_bin:
+                    img_bin_corr = cv2.imread(dir_img_bin + '/' + img_name+'.png')
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          img_bin_corr,
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+                else:
+                    indexer = get_patches(dir_flow_train_imgs,
+                                          dir_flow_train_labels,
+                                          otsu_copy(img),
+                                          lab,
+                                          input_height,
+                                          input_width,
+                                          indexer=indexer)
+            if scaling_brightness:
+                for sc_ind in scales:
+                    for factor in brightness:
+                        img_bright = do_brightening(img, factor)
+                        indexer = get_patches_num_scale_new(
+                            dir_flow_train_imgs,
+                            dir_flow_train_labels,
+                            img_bright,
+                            lab,
+                            input_height,
+                            input_width,
+                            indexer=indexer,
+                            scaler=sc_ind)
+            if scaling_bluring:
+                for sc_ind in scales:
+                    for blur_i in blur_k:
+                        img_blur = bluring(img, blur_i)
+                        indexer = 
get_patches_num_scale_new(
+                            dir_flow_train_imgs,
+                            dir_flow_train_labels,
+                            img_blur,
+                            lab,
+                            input_height,
+                            input_width,
+                            indexer=indexer,
+                            scaler=sc_ind)
+            if scaling_binarization:
+                for sc_ind in scales:
+                    img_bin = otsu_copy(img)
+                    indexer = get_patches_num_scale_new(
+                        dir_flow_train_imgs,
+                        dir_flow_train_labels,
+                        img_bin,
+                        lab,
+                        input_height,
+                        input_width,
+                        indexer=indexer,
+                        scaler=sc_ind)
+            if scaling_flip:
+                for sc_ind in scales:
+                    for f_i in flip_index:
+                        indexer = get_patches_num_scale_new(
+                            dir_flow_train_imgs,
+                            dir_flow_train_labels,
+                            cv2.flip(img, f_i),
+                            cv2.flip(lab, f_i),
+                            input_height,
+                            input_width,
+                            indexer=indexer,
+                            scaler=sc_ind)
+    return indexer

From d1e8a02fd4a50d61d3101db8a9ae870201bde194 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Thu, 29 Jan 2026 03:01:14 +0100
Subject: [PATCH 057/118] training: fix epoch size calculation

---
 src/eynollah/training/train.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py
index 9c638ea..1e2ab3e 100644
--- a/src/eynollah/training/train.py
+++ b/src/eynollah/training/train.py
@@ -394,17 +394,16 @@ def run(_config,
     if save_interval:
         callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config))

-    _log.info("training on %d batches in %d epochs",
-              len(os.listdir(dir_flow_train_imgs)) // n_batch - 1,
-              n_epochs)
-    _log.info("validating on %d batches",
-              len(os.listdir(dir_flow_eval_imgs)) // n_batch - 1)
+    steps_train = len(os.listdir(dir_flow_train_imgs)) // n_batch # - 1
+    steps_val = len(os.listdir(dir_flow_eval_imgs)) // n_batch
+    _log.info("training on %d batches in %d epochs", steps_train, n_epochs)
+    _log.info("validating on %d batches", steps_val)

     model.fit(
         train_gen,
-        steps_per_epoch=len(os.listdir(dir_flow_train_imgs)) // n_batch - 1,
+        steps_per_epoch=steps_train,
         validation_data=val_gen,
         #validation_steps=1, # rs: only one batch??
- validation_steps=len(os.listdir(dir_flow_eval_imgs)) // n_batch - 1, + validation_steps=steps_val, epochs=n_epochs, callbacks=callbacks) From 25153ad307a6ea658dee8d3be19250969530cdfc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Jan 2026 12:19:09 +0100 Subject: [PATCH 058/118] training: add IoU metric --- src/eynollah/training/train.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 1e2ab3e..344522a 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -34,6 +34,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras.optimizers import SGD, Adam +from tensorflow.keras.metrics import MeanIoU from tensorflow.keras.models import load_model from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard from sacred import Experiment @@ -374,7 +375,11 @@ def run(_config, loss = 'mean_squared_error' model.compile(loss=loss, optimizer=Adam(learning_rate=learning_rate), - metrics=['accuracy']) + metrics=['accuracy', MeanIoU(n_classes, + name='iou', + ignore_class=0, + sparse_y_true=False, + sparse_y_pred=False)]) # generating train and evaluation data gen_kwargs = dict(batch_size=n_batch, From e85003db4a74d2a0b3f830c0338402368cb67d48 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 4 Feb 2026 17:32:24 +0100 Subject: [PATCH 059/118] training: re-instate `index_start`, reflect cfg dependency - `index_start`: re-introduce cfg key, pass to Keras `Model.fit` as `initial_epoch` - make config keys `index_start` and `dir_of_start_model` dependent on `continue_training` - improve description --- src/eynollah/training/train.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 344522a..de8cccd 100644 --- 
a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -157,10 +157,12 @@ def config_params(): dir_eval = None # Directory of validation dataset with subdirectories having the names "images" and "labels". dir_output = None # Directory where the augmented training data and the model checkpoints will be saved. pretraining = False # Set to true to (down)load pretrained weights of ResNet50 encoder. - save_interval = None # frequency for writing model checkpoints (nonzero integer for number of batches, or zero for epoch) - continue_training = False # Set to true if you would like to continue training an already trained a model. - dir_of_start_model = '' # Directory containing pretrained encoder to continue training the model. - data_is_provided = False # Only set this to true when you have already provided the input data and the train and eval data are in "dir_output". + save_interval = None # frequency for writing model checkpoints (positive integer for number of batches saved under "model_step_{batch:04d}", otherwise epoch saved under "model_{epoch:02d}") + continue_training = False # Whether to continue training an existing model. + if continue_training: + dir_of_start_model = '' # Directory of model checkpoint to load to continue training. (E.g. if you already trained for 3 epochs, set "dir_of_start_model=dir_output/model_03".) + index_start = 0 # Epoch counter initial value to continue training. (E.g. if you already trained for 3 epochs, set "index_start=3" to continue naming checkpoints model_04, model_05 etc.) + data_is_provided = False # Whether the preprocessed input data (subdirectories "images" and "labels" in both subdirectories "train" and "eval" of "dir_output") has already been generated (in the first epoch of a previous run). if backbone_type == "transformer": transformer_patchsize_x = None # Patch size of vision transformer patches in x direction. transformer_patchsize_y = None # Patch size of vision transformer patches in y direction. 
@@ -190,6 +192,7 @@ def run(_config, weight_decay, learning_rate, continue_training, + index_start, dir_of_start_model, save_interval, augmentation, @@ -312,6 +315,7 @@ def run(_config, custom_objects = {"PatchEncoder": PatchEncoder, "Patches": Patches}) else: + index_start = 0 if backbone_type == 'nontransformer': model = resnet50_unet(n_classes, input_height, @@ -410,7 +414,8 @@ def run(_config, #validation_steps=1, # rs: only one batch?? validation_steps=steps_val, epochs=n_epochs, - callbacks=callbacks) + callbacks=callbacks, + initial_epoch=index_start) #os.system('rm -rf '+dir_train_flowing) #os.system('rm -rf '+dir_eval_flowing) From 1581094141a2eb8892fa58b09de7fe8500e73e08 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 4 Feb 2026 17:35:12 +0100 Subject: [PATCH 060/118] training: extend `index_start` to tasks classification and RO --- src/eynollah/training/train.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index de8cccd..168884a 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -423,11 +423,15 @@ def run(_config, #model.save(dir_output+'/'+'model'+'.h5') elif task=='classification': - model = resnet50_classifier(n_classes, - input_height, - input_width, - weight_decay, - pretraining) + if continue_training: + model = load_model(dir_of_start_model, compile=False) + else: + index_start = 0 + model = resnet50_classifier(n_classes, + input_height, + input_width, + weight_decay, + pretraining) model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), # rs: why not learning_rate? 
@@ -453,7 +457,8 @@ def run(_config, verbose=1, epochs=n_epochs, metrics=[F1Score(average='macro', name='f1')], - callbacks=callbacks) + callbacks=callbacks, + initial_epoch=index_start) usable_checkpoints = np.flatnonzero(np.array(history['val_f1']) > f1_threshold_classification) if len(usable_checkpoints) >= 1: @@ -481,8 +486,15 @@ def run(_config, _log.info("ensemble model saved under '%s'", cp_path) elif task=='reading_order': - model = machine_based_reading_order_model( - n_classes, input_height, input_width, weight_decay, pretraining) + if continue_training: + model = load_model(dir_of_start_model, compile=False) + else: + index_start = 0 + model = machine_based_reading_order_model(n_classes, + input_height, + input_width, + weight_decay, + pretraining) dir_flow_train_imgs = os.path.join(dir_train, 'images') dir_flow_train_labels = os.path.join(dir_train, 'labels') @@ -495,7 +507,6 @@ def run(_config, #ls_test = os.listdir(dir_flow_train_labels) #f1score_tot = [0] - indexer_start = 0 model.compile(loss="binary_crossentropy", #optimizer=SGD(learning_rate=0.01, momentum=0.9), optimizer=Adam(learning_rate=0.0001), # rs: why not learning_rate? 
@@ -515,7 +526,8 @@ def run(_config, steps_per_epoch=num_rows / n_batch, verbose=1, epochs=n_epochs, - callbacks=callbacks) + callbacks=callbacks, + initial_epoch=index_start) ''' if f1score>f1score_tot[0]: f1score_tot[0] = f1score From 7562317da5aa8f4a56c981848d23cb5eec7685d2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 4 Feb 2026 17:35:38 +0100 Subject: [PATCH 061/118] training: fix+simplify `load_model` logic for `continue_training` - add missing combination `transformer` (w/ patch encoder and `weighted_loss`) - add assertion to prevent wrong loss type being configured --- src/eynollah/training/train.py | 36 ++++++++++++---------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 168884a..7ede551 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -290,30 +290,20 @@ def run(_config, weights = weights / float(np.min(weights)) weights = weights / float(np.sum(weights)) + if task == "enhancement": + assert not is_loss_soft_dice, "for enhancement, soft_dice loss does not apply" + assert not weighted_dice, "for enhancement, weighted loss does not apply" if continue_training: - if backbone_type == 'nontransformer': - if is_loss_soft_dice and task in ["segmentation", "binarization"]: - model = load_model(dir_of_start_model, compile=True, - custom_objects={'soft_dice_loss': soft_dice_loss}) - elif weighted_loss and task in ["segmentation", "binarization"]: - model = load_model(dir_of_start_model, compile=True, - custom_objects={'loss': weighted_categorical_crossentropy(weights)}) - else: - model = load_model(dir_of_start_model , compile=True) - - elif backbone_type == 'transformer': - if is_loss_soft_dice and task in ["segmentation", "binarization"]: - model = load_model(dir_of_start_model, compile=True, - custom_objects={"PatchEncoder": PatchEncoder, - "Patches": Patches, - 'soft_dice_loss': soft_dice_loss}) - elif weighted_loss 
and task in ["segmentation", "binarization"]: - model = load_model(dir_of_start_model, compile=True, - custom_objects={'loss': weighted_categorical_crossentropy(weights)}) - else: - model = load_model(dir_of_start_model, compile=True, - custom_objects = {"PatchEncoder": PatchEncoder, - "Patches": Patches}) + custom_objects = dict() + if is_loss_soft_dice: + custom_objects.update(soft_dice_loss=soft_dice_loss) + elif weighted_loss: + custom_objects.update(loss=weighted_categorical_crossentropy(weights)) + if backbone_type == 'transformer': + custom_objects.update(PatchEncoder=PatchEncoder, + Patches=Patches) + model = load_model(dir_of_start_model, compile=False, + custom_objects=custom_objects) else: index_start = 0 if backbone_type == 'nontransformer': From 4a65ee0c672640821ebb54dc647a3e027f21fc46 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 11:53:19 +0100 Subject: [PATCH 062/118] =?UTF-8?q?training.train:=20more=20config=20depen?= =?UTF-8?q?dencies=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - make more config_params keys dependent on each other - re-order accordingly - in main, initialise them (as kwarg), so sacred actually allows overriding them by named config file --- src/eynollah/training/train.py | 67 ++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 7ede551..a21a34d 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -97,7 +97,17 @@ ex = Experiment(save_git_info=False) @ex.config def config_params(): task = "segmentation" # This parameter defines task of model which can be segmentation, enhancement or classification. - backbone_type = None # Type of image feature map network backbone. 
Either a vision transformer alongside a CNN we call "transformer", or only a CNN which we call "nontransformer" + if task in ["segmentation", "binarization", "enhancement"]: + backbone_type = "nontransformer" # Type of image feature map network backbone. Either a vision transformer alongside a CNN we call "transformer", or only a CNN which we call "nontransformer" + if backbone_type == "transformer": + transformer_patchsize_x = None # Patch size of vision transformer patches in x direction. + transformer_patchsize_y = None # Patch size of vision transformer patches in y direction. + transformer_num_patches_xy = None # Number of patches for vision transformer in x and y direction respectively. + transformer_projection_dim = 64 # Transformer projection dimension. Default value is 64. + transformer_mlp_head_units = [128, 64] # Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64] + transformer_layers = 8 # transformer layers. Default value is 8. + transformer_num_heads = 4 # Transformer number of heads. Default value is 4. + transformer_cnn_first = True # We have two types of vision transformers: either the CNN is applied first, followed by the transformer, or reversed. n_classes = None # Number of classes. In the case of binary classification this should be 2. n_epochs = 1 # Number of epochs to train. n_batch = 1 # Number of images per batch at each iteration. (Try as large as fits on VRAM.) @@ -105,10 +115,12 @@ def config_params(): input_width = 224 * 1 # Width of model's input in pixels. weight_decay = 1e-6 # Weight decay of l2 regularization of model layers. learning_rate = 1e-4 # Set the learning rate. - is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false. - weighted_loss = False # Use weighted categorical cross entropy as loss fucntion. When set to true, "is_loss_soft_dice" must be false. 
-    f1_threshold_classification = None # This threshold is used to consider models with an evaluation f1 scores bigger than it. The selected model weights undergo a weights ensembling. And avreage ensembled model will be written to output.
-    classification_classes_name = None # Dictionary of classification classes names.
+    if task in ["segmentation", "binarization"]:
+        is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false.
+        weighted_loss = False # Use weighted categorical cross entropy as loss function. When set to true, "is_loss_soft_dice" must be false.
+    elif task == "classification":
+        f1_threshold_classification = None # This threshold is used to consider models with evaluation f1 scores bigger than it. The selected model weights undergo a weights ensembling. An average ensembled model will be written to output.
+        classification_classes_name = None # Dictionary of classification classes names.
     patches = False # Divides input image into smaller patches (input size of the model) when set to true. For the model to see the full image, like page extraction, set this to false.
     augmentation = False # To apply any kind of augmentation, this parameter must be set to true.
     if augmentation:
@@ -163,17 +175,8 @@ def config_params():
         dir_of_start_model = '' # Directory of model checkpoint to load to continue training. (E.g. if you already trained for 3 epochs, set "dir_of_start_model=dir_output/model_03".)
         index_start = 0 # Epoch counter initial value to continue training. (E.g. if you already trained for 3 epochs, set "index_start=3" to continue naming checkpoints model_04, model_05 etc.)
     data_is_provided = False # Whether the preprocessed input data (subdirectories "images" and "labels" in both subdirectories "train" and "eval" of "dir_output") has already been generated (in the first epoch of a previous run).
- if backbone_type == "transformer": - transformer_patchsize_x = None # Patch size of vision transformer patches in x direction. - transformer_patchsize_y = None # Patch size of vision transformer patches in y direction. - transformer_num_patches_xy = None # Number of patches for vision transformer in x and y direction respectively. - transformer_projection_dim = 64 # Transformer projection dimension. Default value is 64. - transformer_mlp_head_units = [128, 64] # Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64] - transformer_layers = 8 # transformer layers. Default value is 8. - transformer_num_heads = 4 # Transformer number of heads. Default value is 4. - transformer_cnn_first = True # We have two types of vision transformers: either the CNN is applied first, followed by the transformer, or reversed. -@ex.automain +@ex.main def run(_config, _log, task, @@ -187,27 +190,29 @@ def run(_config, n_batch, input_height, input_width, - is_loss_soft_dice, - weighted_loss, weight_decay, learning_rate, continue_training, - index_start, - dir_of_start_model, save_interval, augmentation, - thetha, - backbone_type, - transformer_projection_dim, - transformer_mlp_head_units, - transformer_layers, - transformer_num_heads, - transformer_cnn_first, - transformer_patchsize_x, - transformer_patchsize_y, - transformer_num_patches_xy, - f1_threshold_classification, - classification_classes_name, + # dependent config keys need a default, + # otherwise yields sacred.utils.ConfigAddedError + thetha=None, + is_loss_soft_dice=False, + weighted_loss=False, + index_start=0, + dir_of_start_model=None, + backbone_type=None, + transformer_projection_dim=None, + transformer_mlp_head_units=None, + transformer_layers=None, + transformer_num_heads=None, + transformer_cnn_first=None, + transformer_patchsize_x=None, + transformer_patchsize_y=None, + transformer_num_patches_xy=None, + f1_threshold_classification=None, + classification_classes_name=None, ): if pretraining 
and not os.path.isfile(RESNET50_WEIGHTS_PATH): From 5c7801a1d6273cd88b64548edf41507e5c0235d6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 11:56:11 +0100 Subject: [PATCH 063/118] training.train: simplify config args for model builder --- src/eynollah/training/models.py | 67 +++++++++++++++++++++++---------- src/eynollah/training/train.py | 33 ++++++++-------- 2 files changed, 63 insertions(+), 37 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 011c614..f053447 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -400,9 +400,21 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati return model -def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_head_units=None, transformer_layers=8, num_heads =4, projection_dim = 64, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): - if mlp_head_units is None: - mlp_head_units = [128, 64] +def vit_resnet50_unet(num_patches, + n_classes, + transformer_patchsize_x, + transformer_patchsize_y, + transformer_mlp_head_units=None, + transformer_layers=8, + transformer_num_heads=4, + transformer_projection_dim=64, + input_height=224, + input_width=224, + task="segmentation", + weight_decay=1e-6, + pretraining=False): + if transformer_mlp_head_units is None: + transformer_mlp_head_units = [128, 64] inputs = layers.Input(shape=(input_height, input_width, 3)) #transformer_units = [ @@ -449,30 +461,30 @@ def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_he #num_patches = x.shape[1]*x.shape[2] - #patch_size_y = input_height / x.shape[1] - #patch_size_x = input_width / x.shape[2] - #patch_size = patch_size_x * patch_size_y - patches = Patches(patch_size_x, patch_size_y)(x) + patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(x) # Encode patches. 
- encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) for _ in range(transformer_layers): # Layer normalization 1. x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) # Create a multi-head attention layer. attention_output = layers.MultiHeadAttention( - num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + num_heads=transformer_num_heads, key_dim=transformer_projection_dim, dropout=0.1 )(x1, x1) # Skip connection 1. x2 = layers.Add()([attention_output, encoded_patches]) # Layer normalization 2. x3 = layers.LayerNormalization(epsilon=1e-6)(x2) # MLP. - x3 = mlp(x3, hidden_units=mlp_head_units, dropout_rate=0.1) + x3 = mlp(x3, hidden_units=transformer_mlp_head_units, dropout_rate=0.1) # Skip connection 2. encoded_patches = layers.Add()([x3, x2]) - encoded_patches = tf.reshape(encoded_patches, [-1, x.shape[1], x.shape[2] , int( projection_dim / (patch_size_x * patch_size_y) )]) + encoded_patches = tf.reshape(encoded_patches, + [-1, x.shape[1], x.shape[2], + transformer_projection_dim // (transformer_patchsize_x * + transformer_patchsize_y)]) v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(encoded_patches) v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) @@ -524,9 +536,21 @@ def vit_resnet50_unet(n_classes, patch_size_x, patch_size_y, num_patches, mlp_he return model -def vit_resnet50_unet_transformer_before_cnn(n_classes, patch_size_x, patch_size_y, num_patches, mlp_head_units=None, transformer_layers=8, num_heads =4, projection_dim = 64, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): - if mlp_head_units is None: - mlp_head_units = [128, 64] +def vit_resnet50_unet_transformer_before_cnn(num_patches, + n_classes, + transformer_patchsize_x, + transformer_patchsize_y, + transformer_mlp_head_units=None, + transformer_layers=8, + 
transformer_num_heads=4, + transformer_projection_dim=64, + input_height=224, + input_width=224, + task="segmentation", + weight_decay=1e-6, + pretraining=False): + if transformer_mlp_head_units is None: + transformer_mlp_head_units = [128, 64] inputs = layers.Input(shape=(input_height, input_width, 3)) ##transformer_units = [ @@ -536,27 +560,32 @@ def vit_resnet50_unet_transformer_before_cnn(n_classes, patch_size_x, patch_size IMAGE_ORDERING = 'channels_last' bn_axis=3 - patches = Patches(patch_size_x, patch_size_y)(inputs) + patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) # Encode patches. - encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) for _ in range(transformer_layers): # Layer normalization 1. x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) # Create a multi-head attention layer. attention_output = layers.MultiHeadAttention( - num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + num_heads=transformer_num_heads, key_dim=transformer_projection_dim, dropout=0.1 )(x1, x1) # Skip connection 1. x2 = layers.Add()([attention_output, encoded_patches]) # Layer normalization 2. x3 = layers.LayerNormalization(epsilon=1e-6)(x2) # MLP. - x3 = mlp(x3, hidden_units=mlp_head_units, dropout_rate=0.1) + x3 = mlp(x3, hidden_units=transformer_mlp_head_units, dropout_rate=0.1) # Skip connection 2. 
encoded_patches = layers.Add()([x3, x2]) - encoded_patches = tf.reshape(encoded_patches, [-1, input_height, input_width , int( projection_dim / (patch_size_x * patch_size_y) )]) + encoded_patches = tf.reshape(encoded_patches, + [-1, + input_height, + input_width, + transformer_projection_dim // (transformer_patchsize_x * + transformer_patchsize_y)]) encoded_patches = Conv2D(3, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay), name='convinput')(encoded_patches) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index a21a34d..4aafcf2 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -38,6 +38,7 @@ from tensorflow.keras.metrics import MeanIoU from tensorflow.keras.models import load_model from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard from sacred import Experiment +from sacred.config import create_captured_function from tqdm import tqdm from sklearn.metrics import f1_score @@ -318,7 +319,7 @@ def run(_config, task, weight_decay, pretraining) - elif backbone_type == 'transformer': + else: num_patches_x = transformer_num_patches_xy[0] num_patches_y = transformer_num_patches_xy[1] num_patches = num_patches_x * num_patches_y @@ -330,35 +331,31 @@ def run(_config, model_builder = vit_resnet50_unet_transformer_before_cnn multiple_of_32 = False - assert input_height == num_patches_y * transformer_patchsize_y * (32 if multiple_of_32 else 1), \ + assert input_height == (num_patches_y * + transformer_patchsize_y * + (32 if multiple_of_32 else 1)), \ "transformer_patchsize_y or transformer_num_patches_xy height value error: " \ "input_height should be equal to " \ "(transformer_num_patches_xy height value * transformer_patchsize_y%s)" % \ " * 32" if multiple_of_32 else "" - assert input_width == num_patches_x * transformer_patchsize_x * (32 if multiple_of_32 else 1), \ + assert input_width == (num_patches_x * + transformer_patchsize_x * + (32 if multiple_of_32 
else 1)), \ "transformer_patchsize_x or transformer_num_patches_xy width value error: " \ "input_width should be equal to " \ "(transformer_num_patches_xy width value * transformer_patchsize_x%s)" % \ " * 32" if multiple_of_32 else "" - assert 0 == transformer_projection_dim % (transformer_patchsize_y * transformer_patchsize_x), \ + assert 0 == (transformer_projection_dim % + (transformer_patchsize_y * + transformer_patchsize_x)), \ "transformer_projection_dim error: " \ "The remainder when parameter transformer_projection_dim is divided by " \ "(transformer_patchsize_y*transformer_patchsize_x) should be zero" - model = model_builder( - n_classes, - transformer_patchsize_x, - transformer_patchsize_y, - num_patches, - transformer_mlp_head_units, - transformer_layers, - transformer_num_heads, - transformer_projection_dim, - input_height, - input_width, - task, - weight_decay, - pretraining) + model_builder = create_captured_function(model_builder) + model_builder.config = _config + model_builder.logger = _log + model = model_builder(num_patches) #if you want to see the model structure just uncomment model summary. 
#model.summary() From 82d649061a7d932df25828081c01b25a6acae012 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 11:57:38 +0100 Subject: [PATCH 064/118] training.train: fix F1 metric score setup --- src/eynollah/training/train.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 4aafcf2..effc920 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -34,7 +34,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras.optimizers import SGD, Adam -from tensorflow.keras.metrics import MeanIoU +from tensorflow.keras.metrics import MeanIoU, F1Score from tensorflow.keras.models import load_model from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard from sacred import Experiment @@ -427,8 +427,8 @@ def run(_config, model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), # rs: why not learning_rate? 
- metrics=['accuracy']) - + metrics=['accuracy', F1Score(average='macro', name='f1')]) + list_classes = list(classification_classes_name.values()) trainXY = generate_data_from_folder_training( dir_train, n_batch, input_height, input_width, n_classes, list_classes) @@ -440,7 +440,8 @@ def run(_config, callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config, monitor='val_f1', - save_best_only=True, mode='max')] + #save_best_only=True, # we need all for ensembling + mode='max')] history = model.fit(trainXY, steps_per_epoch=num_rows / n_batch, @@ -448,17 +449,17 @@ def run(_config, validation_data=testXY, verbose=1, epochs=n_epochs, - metrics=[F1Score(average='macro', name='f1')], callbacks=callbacks, initial_epoch=index_start) - usable_checkpoints = np.flatnonzero(np.array(history['val_f1']) > f1_threshold_classification) + usable_checkpoints = np.flatnonzero(np.array(history.history['val_f1']) > + f1_threshold_classification) if len(usable_checkpoints) >= 1: _log.info("averaging over usable checkpoints: %s", str(usable_checkpoints)) all_weights = [] for epoch in usable_checkpoints: - cp_path = os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch)) - assert os.path.isdir(cp_path) + cp_path = os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch + 1)) + assert os.path.isdir(cp_path), cp_path model = load_model(cp_path, compile=False) all_weights.append(model.get_weights()) From f03124f747db7edef03d968e1b10db0e7638850d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 11:58:50 +0100 Subject: [PATCH 065/118] =?UTF-8?q?training.train:=20simplify+fix=20classi?= =?UTF-8?q?fication=20data=20loaders=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - unify `generate_data_from_folder_training` w/ `..._evaluation` - instead of recreating array after every batch, just zero out - cast image results to uint8 instead of 
uint16 - cast categorical results to float instead of int --- src/eynollah/training/train.py | 15 ++++--- src/eynollah/training/utils.py | 78 ++++++++-------------------------- 2 files changed, 25 insertions(+), 68 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index effc920..0f8d0e9 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -430,13 +430,13 @@ def run(_config, metrics=['accuracy', F1Score(average='macro', name='f1')]) list_classes = list(classification_classes_name.values()) - trainXY = generate_data_from_folder_training( - dir_train, n_batch, input_height, input_width, n_classes, list_classes) - testXY = generate_data_from_folder_evaluation( - dir_eval, input_height, input_width, n_classes, list_classes) + trainXY = generate_data_from_folder( + dir_train, n_batch, input_height, input_width, n_classes, list_classes, shuffle=True) + testXY = generate_data_from_folder( + dir_eval, n_batch, input_height, input_width, n_classes, list_classes) + epoch_size_train = return_number_of_total_training_data(dir_train) + epoch_size_eval = return_number_of_total_training_data(dir_eval) - y_tot = np.zeros((testX.shape[0], n_classes)) - num_rows = return_number_of_total_training_data(dir_train) callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config, monitor='val_f1', @@ -444,9 +444,10 @@ def run(_config, mode='max')] history = model.fit(trainXY, - steps_per_epoch=num_rows / n_batch, + steps_per_epoch=epoch_size_train // n_batch, #class_weight=weights) validation_data=testXY, + validation_steps=epoch_size_eval // n_batch, verbose=1, epochs=n_epochs, callbacks=callbacks, diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index 61b2536..5b25a4f 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -166,50 +166,7 @@ def return_number_of_total_training_data(path_classes): -def 
generate_data_from_folder_evaluation(path_classes, height, width, n_classes, list_classes): - #sub_classes = os.listdir(path_classes) - #n_classes = len(sub_classes) - all_imgs = [] - labels = [] - #dicts =dict() - #indexer= 0 - for indexer, sub_c in enumerate(list_classes): - sub_files = os.listdir(os.path.join(path_classes,sub_c )) - sub_files = [os.path.join(path_classes,sub_c )+'/' + x for x in sub_files] - #print( os.listdir(os.path.join(path_classes,sub_c )) ) - all_imgs = all_imgs + sub_files - sub_labels = list( np.zeros( len(sub_files) ) +indexer ) - - #print( len(sub_labels) ) - labels = labels + sub_labels - #dicts[sub_c] = indexer - #indexer +=1 - - - categories = to_categorical(range(n_classes)).astype(np.int16)#[ [1 , 0, 0 , 0 , 0 , 0] , [0 , 1, 0 , 0 , 0 , 0] , [0 , 0, 1 , 0 , 0 , 0] , [0 , 0, 0 , 1 , 0 , 0] , [0 , 0, 0 , 0 , 1 , 0] , [0 , 0, 0 , 0 , 0 , 1] ] - ret_x= np.zeros((len(labels), height,width, 3)).astype(np.int16) - ret_y= np.zeros((len(labels), n_classes)).astype(np.int16) - - #print(all_imgs) - for i in range(len(all_imgs)): - row = all_imgs[i] - #####img = cv2.imread(row, 0) - #####img= resize_image (img, height, width) - #####img = img.astype(np.uint16) - #####ret_x[i, :,:,0] = img[:,:] - #####ret_x[i, :,:,1] = img[:,:] - #####ret_x[i, :,:,2] = img[:,:] - - img = cv2.imread(row) - img= resize_image (img, height, width) - img = img.astype(np.uint16) - ret_x[i, :,:] = img[:,:,:] - - ret_y[i, :] = categories[ int( labels[i] ) ][:] - - return ret_x/255., ret_y - -def generate_data_from_folder_training(path_classes, batchsize, height, width, n_classes, list_classes): +def generate_data_from_folder(path_classes, batchsize, height, width, n_classes, list_classes, shuffle=False): #sub_classes = os.listdir(path_classes) #n_classes = len(sub_classes) @@ -228,43 +185,42 @@ def generate_data_from_folder_training(path_classes, batchsize, height, width, n labels = labels + sub_labels #dicts[sub_c] = indexer #indexer +=1 - - ids = 
np.array(range(len(labels))) - random.shuffle(ids) - - shuffled_labels = np.array(labels)[ids] - shuffled_files = np.array(all_imgs)[ids] + + if shuffle: + ids = np.array(range(len(labels))) + random.shuffle(ids) + labels = np.array(labels)[ids] + all_imgs = np.array(all_imgs)[ids] + categories = to_categorical(range(n_classes)).astype(np.int16)#[ [1 , 0, 0 , 0 , 0 , 0] , [0 , 1, 0 , 0 , 0 , 0] , [0 , 0, 1 , 0 , 0 , 0] , [0 , 0, 0 , 1 , 0 , 0] , [0 , 0, 0 , 0 , 1 , 0] , [0 , 0, 0 , 0 , 0 , 1] ] - ret_x= np.zeros((batchsize, height,width, 3)).astype(np.int16) - ret_y= np.zeros((batchsize, n_classes)).astype(np.int16) + ret_x= np.zeros((batchsize, height,width, 3)).astype(np.uint8) + ret_y= np.zeros((batchsize, n_classes)).astype(float) batchcount = 0 while True: - for i in range(len(shuffled_files)): - row = shuffled_files[i] - #print(row) - ###img = cv2.imread(row, 0) + for lab, img in zip(labels, all_imgs): + ###img = cv2.imread(img, 0) ###img= resize_image (img, height, width) ###img = img.astype(np.uint16) ###ret_x[batchcount, :,:,0] = img[:,:] ###ret_x[batchcount, :,:,1] = img[:,:] ###ret_x[batchcount, :,:,2] = img[:,:] - img = cv2.imread(row) + img = cv2.imread(img) img= resize_image (img, height, width) img = img.astype(np.uint16) ret_x[batchcount, :,:,:] = img[:,:,:] #print(int(shuffled_labels[i]) ) #print( categories[int(shuffled_labels[i])] ) - ret_y[batchcount, :] = categories[ int( shuffled_labels[i] ) ][:] + ret_y[batchcount, :] = categories[int(lab)][:] batchcount+=1 if batchcount>=batchsize: - ret_x = ret_x/255. 
+ ret_x = ret_x//255 yield ret_x, ret_y - ret_x= np.zeros((batchsize, height,width, 3)).astype(np.int16) - ret_y= np.zeros((batchsize, n_classes)).astype(np.int16) + ret_x[:] = 0 + ret_y[:] = 0 batchcount = 0 def do_brightening(img, factor): From 5d0c26b629dc0f7368c7d2058a2efbd0ac27a911 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 12:02:58 +0100 Subject: [PATCH 066/118] training.train: use std Keras data loader for classification (much more efficient, works with std F1 metric) --- src/eynollah/training/train.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 0f8d0e9..7cf7536 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -23,8 +23,6 @@ from eynollah.training.models import ( from eynollah.training.utils import ( data_gen, generate_arrays_from_folder_reading_order, - generate_data_from_folder_evaluation, - generate_data_from_folder_training, get_one_hot, preprocess_imgs, return_number_of_total_training_data @@ -37,6 +35,7 @@ from tensorflow.keras.optimizers import SGD, Adam from tensorflow.keras.metrics import MeanIoU, F1Score from tensorflow.keras.models import load_model from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard +from tensorflow.keras.utils import image_dataset_from_directory from sacred import Experiment from sacred.config import create_captured_function from tqdm import tqdm @@ -430,13 +429,13 @@ def run(_config, metrics=['accuracy', F1Score(average='macro', name='f1')]) list_classes = list(classification_classes_name.values()) - trainXY = generate_data_from_folder( - dir_train, n_batch, input_height, input_width, n_classes, list_classes, shuffle=True) - testXY = generate_data_from_folder( - dir_eval, n_batch, input_height, input_width, n_classes, list_classes) - epoch_size_train = return_number_of_total_training_data(dir_train) - epoch_size_eval = 
return_number_of_total_training_data(dir_eval) - + data_args = dict(label_mode="categorical", + class_names=list_classes, + batch_size=n_batch, + image_size=(input_height, input_width), + interpolation="nearest") + trainXY = image_dataset_from_directory(dir_train, shuffle=True, **data_args) + testXY = image_dataset_from_directory(dir_eval, shuffle=False, **data_args) callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config, monitor='val_f1', @@ -444,10 +443,8 @@ def run(_config, mode='max')] history = model.fit(trainXY, - steps_per_epoch=epoch_size_train // n_batch, #class_weight=weights) validation_data=testXY, - validation_steps=epoch_size_eval // n_batch, verbose=1, epochs=n_epochs, callbacks=callbacks, From b1633dfc7cf9cdfd84586b2fe367a8bd239fc2cf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 14:53:26 +0100 Subject: [PATCH 067/118] training.generate_gt: for RO, skip files if regionRefs are missing --- .../training/generate_gt_for_training.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/eynollah/training/generate_gt_for_training.py b/src/eynollah/training/generate_gt_for_training.py index 693cab8..f71614c 100644 --- a/src/eynollah/training/generate_gt_for_training.py +++ b/src/eynollah/training/generate_gt_for_training.py @@ -205,14 +205,20 @@ def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, i img_header_and_sep = np.zeros((y_len,x_len), dtype='uint8') for j in range(len(cy_main)): - img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12,int(x_min_main[j]):int(x_max_main[j]) ] = 1 + img_header_and_sep[int(y_max_main[j]):int(y_max_main[j])+12, + int(x_min_main[j]):int(x_max_main[j]) ] = 1 - texts_corr_order_index = [index_tot_regions[tot_region_ref.index(i)] for i in id_all_text ] - texts_corr_order_index_int = [int(x) for x in texts_corr_order_index] - + try: + texts_corr_order_index_int = 
[int(index_tot_regions[tot_region_ref.index(i)]) + for i in id_all_text] + except ValueError as e: + print("incomplete ReadingOrder in", xml_file, "- skipping:", str(e)) + continue - co_text_all, texts_corr_order_index_int, regions_ar_less_than_early_min = filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, max_area, min_area, min_area_early) + co_text_all, texts_corr_order_index_int, regions_ar_less_than_early_min = \ + filter_contours_area_of_image(img_poly, co_text_all, texts_corr_order_index_int, + max_area, min_area, min_area_early) arg_array = np.array(range(len(texts_corr_order_index_int))) From 0d3a8eacba67f6fc6b8bec0fbe6ea12d4d1b948f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 14:54:08 +0100 Subject: [PATCH 068/118] improve/update docs/train.md --- docs/train.md | 110 +++++++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 46 deletions(-) diff --git a/docs/train.md b/docs/train.md index 4e76740..3c64ab9 100644 --- a/docs/train.md +++ b/docs/train.md @@ -343,51 +343,17 @@ The following parameter configuration can be applied to all segmentation use cas its sub-parameters, and continued training are defined only for segmentation use cases and enhancements, not for classification and machine-based reading order, as you can see in their example config files. -* `backbone_type`: For segmentation tasks (such as text line, binarization, and layout detection) and enhancement, we - offer two backbone options: a "nontransformer" and a "transformer" backbone. For the "transformer" backbone, we first - apply a CNN followed by a transformer. In contrast, the "nontransformer" backbone utilizes only a CNN ResNet-50. -* `task`: The task parameter can have values such as "segmentation", "enhancement", "classification", and "reading_order". -* `patches`: If you want to break input images into smaller patches (input size of the model) you need to set this -* parameter to `true`. 
In the case that the model should see the image once, like page extraction, patches should be - set to ``false``. -* `n_batch`: Number of batches at each iteration. -* `n_classes`: Number of classes. In the case of binary classification this should be 2. In the case of reading_order it - should set to 1. And for the case of layout detection just the unique number of classes should be given. -* `n_epochs`: Number of epochs. -* `input_height`: This indicates the height of model's input. -* `input_width`: This indicates the width of model's input. -* `weight_decay`: Weight decay of l2 regularization of model layers. -* `pretraining`: Set to `true` to load pretrained weights of ResNet50 encoder. The downloaded weights should be saved - in a folder named "pretrained_model" in the same directory of "train.py" script. -* `augmentation`: If you want to apply any kind of augmentation this parameter should first set to `true`. -* `flip_aug`: If `true`, different types of filp will be applied on image. Type of flips is given with "flip_index" parameter. -* `blur_aug`: If `true`, different types of blurring will be applied on image. Type of blurrings is given with "blur_k" parameter. -* `scaling`: If `true`, scaling will be applied on image. Scale of scaling is given with "scales" parameter. -* `degrading`: If `true`, degrading will be applied to the image. The amount of degrading is defined with "degrade_scales" parameter. -* `brightening`: If `true`, brightening will be applied to the image. The amount of brightening is defined with "brightness" parameter. -* `rotation_not_90`: If `true`, rotation (not 90 degree) will be applied on image. Rotation angles are given with "thetha" parameter. -* `rotation`: If `true`, 90 degree rotation will be applied on image. -* `binarization`: If `true`,Otsu thresholding will be applied to augment the input data with binarized images. -* `scaling_bluring`: If `true`, combination of scaling and blurring will be applied on image. 
-* `scaling_binarization`: If `true`, combination of scaling and binarization will be applied on image. -* `scaling_flip`: If `true`, combination of scaling and flip will be applied on image. -* `flip_index`: Type of flips. -* `blur_k`: Type of blurrings. -* `scales`: Scales of scaling. -* `brightness`: The amount of brightenings. -* `thetha`: Rotation angles. -* `degrade_scales`: The amount of degradings. -* `continue_training`: If `true`, it means that you have already trained a model and you would like to continue the - training. So it is needed to providethe dir of trained model with "dir_of_start_model" and index for naming - themodels. For example if you have already trained for 3 epochs then your lastindex is 2 and if you want to continue - from model_1.h5, you can set `index_start` to 3 to start naming model with index 3. -* `weighted_loss`: If `true`, this means that you want to apply weighted categorical_crossentropy as loss fucntion. Be carefull if you set to `true`the parameter "is_loss_soft_dice" should be ``false`` -* `data_is_provided`: If you have already provided the input data you can set this to `true`. Be sure that the train - and eval data are in"dir_output".Since when once we provide training data we resize and augmentthem and then wewrite - them in sub-directories train and eval in "dir_output". -* `dir_train`: This is the directory of "images" and "labels" (dir_train should include two subdirectories with names of images and labels ) for raw images and labels. Namely they are not prepared (not resized and not augmented) yet for training the model. When we run this tool these raw data will be transformed to suitable size needed for the model and they will be written in "dir_output" in train and eval directories. Each of train and eval include "images" and "labels" sub-directories. -* `index_start`: Starting index for saved models in the case that "continue_training" is `true`. 
-* `dir_of_start_model`: Directory containing pretrained model to continue training the model in the case that "continue_training" is `true`. +* `task`: The task parameter must be one of the following values: + - `binarization`, + - `enhancement`, + - `segmentation`, + - `classification`, + - `reading_order`. +* `backbone_type`: For the tasks `segmentation` (such as text line, and region layout detection), + `binarization` and `enhancement`, we offer two backbone options: + - `nontransformer` (only a CNN ResNet-50). + - `transformer` (first apply a CNN, followed by a transformer) +* `transformer_cnn_first`: Whether to apply the CNN first (followed by the transformer) when using `transformer` backbone. * `transformer_num_patches_xy`: Number of patches for vision transformer in x and y direction respectively. * `transformer_patchsize_x`: Patch size of vision transformer patches in x direction. * `transformer_patchsize_y`: Patch size of vision transformer patches in y direction. @@ -395,7 +361,59 @@ classification and machine-based reading order, as you can see in their example * `transformer_mlp_head_units`: Transformer Multilayer Perceptron (MLP) head units. Default value is [128, 64]. * `transformer_layers`: transformer layers. Default value is 8. * `transformer_num_heads`: Transformer number of heads. Default value is 4. -* `transformer_cnn_first`: We have two types of vision transformers. In one type, a CNN is applied first, followed by a transformer. In the other type, this order is reversed. If transformer_cnn_first is true, it means the CNN will be applied before the transformer. Default value is true. +* `patches`: Whether to break up (tile) input images into smaller patches (input size of the model). + If `false`, the model will see the image once (resized to the input size of the model). + Should be set to `false` for cases like page extraction. +* `n_batch`: Number of batches at each iteration. +* `n_classes`: Number of classes. 
In the case of binary classification this should be 2. In the case of reading_order it + should be set to 1. And for the case of layout detection just the unique number of classes should be given. + * `n_epochs`: Number of epochs (iterations over the data) to train. + * `input_height`: the image height for the model's input. + * `input_width`: the image width for the model's input. + * `weight_decay`: Weight decay of l2 regularization of model layers. + * `weighted_loss`: If `true`, this means that you want to apply weighted categorical crossentropy as loss function. + (Mutually exclusive with `is_loss_soft_dice`, and only applies for `segmentation` and `binarization` tasks.) + * `pretraining`: Set to `true` to (download and) initialise pretrained weights of ResNet50 encoder. + * `dir_train`: Path to directory of raw training data (as extracted via `pagexml2labels`, i.e. with subdirectories + `images` and `labels` for input images and output labels). + (These are not prepared for training the model, yet. Upon first run, the raw data will be transformed to suitable size + needed for the model, and written in `dir_output` under `train` and `eval` subdirectories. See `data_is_provided`.) + * `dir_eval`: Ditto for raw evaluation data. + * `dir_output`: Directory to write model checkpoints, logs (for Tensorboard) and precomputed images to. + * `data_is_provided`: If you have already trained at least one complete epoch (using the same data settings) before, + you can set this to `true` to avoid computing the resized / patched / augmented image files again. + Be sure that the subdirectories `train` and `eval` are in `dir_output` (each with subdirectories `images` + and `labels`, respectively). + * `continue_training`: If `true`, continue training a model checkpoint from a previous run. + This requires providing the directory of the model checkpoint to load via `dir_of_start_model` + and setting `index_start` counter for naming new checkpoints.
+ For example if you have already trained for 3 epochs, then your last index is 2, so if you want + to continue with `model_04`, `model_05` etc., set `index_start=3`. +* `index_start`: Starting index for saving models in the case that `continue_training` is `true`. + (Existing checkpoints above this will be overwritten.) +* `dir_of_start_model`: Directory containing existing model checkpoint to initialise model weights from when `continue_training=true`. + (Can be an epoch-interval checkpoint, or batch-interval checkpoint from `save_interval`.) +* `augmentation`: If you want to apply any kind of augmentation this parameter should first set to `true`. + The remaining settings pertain to that... +* `flip_aug`: If `true`, different types of flipping over the image arrays. Requires `flip_index` parameter. +* `flip_index`: List of flip codes (as in `cv2.flip`, i.e. 0 for vertical, positive for horizontal shift, negative for vertical and horizontal shift). +* `blur_aug`: If `true`, different types of blurring will be applied on image. Requires `blur_k` parameter. +* `blur_k`: Method of blurring (`gauss`, `median` or `blur`). +* `scaling`: If `true`, scaling will be applied on image. Requires `scales` parameter. +* `scales`: List of scale factors for scaling. +* `scaling_bluring`: If `true`, combination of scaling and blurring will be applied on image. +* `scaling_binarization`: If `true`, combination of scaling and binarization will be applied on image. +* `scaling_flip`: If `true`, combination of scaling and flip will be applied on image. +* `degrading`: If `true`, degrading will be applied to the image. Requires `degrade_scales` parameter. +* `degrade_scales`: List of intensity factors for degrading. +* `brightening`: If `true`, brightening will be applied to the image. Requires `brightness` parameter. +* `brightness`: List of intensity factors for brightening. 
+* `binarization`: If `true`, Otsu thresholding will be applied to augment the input data with binarized images. +* `dir_img_bin`: With `binarization`, use this directory to read precomputed binarized images instead of ad-hoc Otsu. + (Base names should correspond to the files in `dir_train/images`.) +* `rotation`: If `true`, 90° rotation will be applied on images. +* `rotation_not_90`: If `true`, random rotation (other than 90°) will be applied on image. Requires `thetha` parameter. +* `thetha`: List of rotation angles (in degrees). In case of segmentation and enhancement the train and evaluation data should be organised as follows. From 6944d3161717bbe1a821ba50658fcf6aae4ba9ac Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Feb 2026 17:58:32 +0100 Subject: [PATCH 069/118] =?UTF-8?q?modify=20manual=20RO=20preference?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit in `return_boxes_of_images_by_order_of_reading_new`, when the next multicol separator ends in the same column, do not recurse into subspan if the next starts earlier (but continue with top span to the right first) --- src/eynollah/utils/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 4e55aef..b839385 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1881,7 +1881,10 @@ def return_boxes_of_images_by_order_of_reading_new( y_mid[nxt]]) # dbg_plt(boxes[-1], "recursive column %d:%d box [%d]" % (column, last, len(boxes))) column = last - if last == x_ending[nxt] and x_ending[nxt] <= x_ending[cur] and nxt in args: + if (last == x_ending[nxt] and + x_ending[nxt] <= x_ending[cur] and + x_starting[nxt] >= x_starting[cur] and + nxt in args): # child – recur # print("recur", nxt, y_mid[nxt], "%d:%d" % (x_starting[nxt], x_ending[nxt])) args.remove(nxt) From bd282a594d7dac9adcbcce55b09fbd1e1a7f85a9 Mon Sep 
17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Feb 2026 16:34:55 +0100 Subject: [PATCH 070/118] training follow-up: - use relative imports - use tf.keras everywhere (and ensure v2) - `weights_ensembling`: * use `Patches` and `PatchEncoder` from .models * drop TF1 stuff * make function / CLI more flexible (expect list of checkpoint dirs instead of single top-level directory) - train for `classification`: delegate to `weights_ensembling.run_ensembling` --- src/eynollah/eynollah_imports.py | 3 + src/eynollah/training/cli.py | 2 +- .../training/generate_gt_for_training.py | 14 +- src/eynollah/training/inference.py | 4 +- src/eynollah/training/train.py | 116 ++++++------- src/eynollah/training/weights_ensembling.py | 156 +++++------------- 6 files changed, 112 insertions(+), 183 deletions(-) diff --git a/src/eynollah/eynollah_imports.py b/src/eynollah/eynollah_imports.py index f04cfdc..496406c 100644 --- a/src/eynollah/eynollah_imports.py +++ b/src/eynollah/eynollah_imports.py @@ -1,6 +1,9 @@ """ Load libraries with possible race conditions once. This must be imported as the first module of eynollah. 
""" +import os +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 + from ocrd_utils import tf_disable_interactive_logs from torch import * tf_disable_interactive_logs() diff --git a/src/eynollah/training/cli.py b/src/eynollah/training/cli.py index 3718275..ae14f04 100644 --- a/src/eynollah/training/cli.py +++ b/src/eynollah/training/cli.py @@ -9,7 +9,7 @@ from .generate_gt_for_training import main as generate_gt_cli from .inference import main as inference_cli from .train import ex from .extract_line_gt import linegt_cli -from .weights_ensembling import main as ensemble_cli +from .weights_ensembling import ensemble_cli @click.command(context_settings=dict( ignore_unknown_options=True, diff --git a/src/eynollah/training/generate_gt_for_training.py b/src/eynollah/training/generate_gt_for_training.py index 2c076d3..2422cc2 100644 --- a/src/eynollah/training/generate_gt_for_training.py +++ b/src/eynollah/training/generate_gt_for_training.py @@ -7,7 +7,7 @@ from PIL import Image, ImageDraw, ImageFont import cv2 import numpy as np -from eynollah.training.gt_gen_utils import ( +from .gt_gen_utils import ( filter_contours_area_of_image, find_format_of_given_filename_in_dir, find_new_features_of_contours, @@ -26,6 +26,9 @@ from eynollah.training.gt_gen_utils import ( @click.group() def main(): + """ + extract GT data suitable for model training for various tasks + """ pass @main.command() @@ -74,6 +77,9 @@ def main(): ) def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, dir_out_images): + """ + extract PAGE-XML GT data suitable for model training for segmentation tasks + """ if config: with open(config) as f: config_params = json.load(f) @@ -110,6 +116,9 @@ def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, di type=click.Path(exists=True, dir_okay=False), ) def image_enhancement(dir_imgs, dir_out_images, dir_out_labels, scales): + """ + extract image GT data suitable for model training for image 
enhancement tasks + """ ls_imgs = os.listdir(dir_imgs) with open(scales) as f: scale_dict = json.load(f) @@ -175,6 +184,9 @@ def image_enhancement(dir_imgs, dir_out_images, dir_out_labels, scales): ) def machine_based_reading_order(dir_xml, dir_out_modal_image, dir_out_classes, input_height, input_width, min_area_size, min_area_early): + """ + extract PAGE-XML GT data suitable for model training for reading-order task + """ xml_files_ind = os.listdir(dir_xml) xml_files_ind = [ind_xml for ind_xml in xml_files_ind if ind_xml.endswith('.xml')] input_height = int(input_height) diff --git a/src/eynollah/training/inference.py b/src/eynollah/training/inference.py index 454c689..2b26210 100644 --- a/src/eynollah/training/inference.py +++ b/src/eynollah/training/inference.py @@ -33,9 +33,9 @@ from .metrics import ( soft_dice_loss, weighted_categorical_crossentropy, ) +from.utils import scale_padd_image_for_ocr +from ..utils.utils_ocr import decode_batch_predictions -from.utils import (scale_padd_image_for_ocr) -from eynollah.utils.utils_ocr import (decode_batch_predictions) with warnings.catch_warnings(): warnings.simplefilter("ignore") diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 61dbdf7..217ab35 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -3,32 +3,8 @@ import sys import json import requests -import click -from eynollah.training.metrics import ( - soft_dice_loss, - weighted_categorical_crossentropy -) -from eynollah.training.models import ( - PatchEncoder, - Patches, - machine_based_reading_order_model, - resnet50_classifier, - resnet50_unet, - vit_resnet50_unet, - vit_resnet50_unet_transformer_before_cnn, - cnn_rnn_ocr_model, - RESNET50_WEIGHTS_PATH, - RESNET50_WEIGHTS_URL -) -from eynollah.training.utils import ( - data_gen, - generate_arrays_from_folder_reading_order, - get_one_hot, - preprocess_imgs, -) - -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras.optimizers import SGD, Adam @@ -43,6 +19,31 @@ from sacred.config import create_captured_function import numpy as np import cv2 +from .metrics import ( + soft_dice_loss, + weighted_categorical_crossentropy +) +from .models import ( + PatchEncoder, + Patches, + machine_based_reading_order_model, + resnet50_classifier, + resnet50_unet, + vit_resnet50_unet, + vit_resnet50_unet_transformer_before_cnn, + cnn_rnn_ocr_model, + RESNET50_WEIGHTS_PATH, + RESNET50_WEIGHTS_URL +) +from .utils import ( + data_gen, + generate_arrays_from_folder_reading_order, + get_one_hot, + preprocess_imgs, +) +from .weights_ensembling import run_ensembling + + class SaveWeightsAfterSteps(ModelCheckpoint): def __init__(self, save_interval, save_path, _config, **kwargs): if save_interval: @@ -65,9 +66,7 @@ class SaveWeightsAfterSteps(ModelCheckpoint): super()._save_handler(filepath) with open(os.path.join(filepath, "config.json"), "w") as fp: json.dump(self._config, fp) # encode dict into JSON - - - + def configuration(): try: for device in tf.config.list_physical_devices('GPU'): @@ -272,6 +271,9 @@ def run(_config, skewing_amplitudes=None, max_len=None, ): + """ + run configured experiment via sacred + """ if pretraining and not os.path.isfile(RESNET50_WEIGHTS_PATH): _log.info("downloading RESNET50 pretrained weights to %s", RESNET50_WEIGHTS_PATH) @@ -312,7 +314,7 @@ def run(_config, imgs_list = list(os.listdir(dir_img)) segs_list = list(os.listdir(dir_seg)) - + imgs_list_test = list(os.listdir(dir_img_val)) segs_list_test = list(os.listdir(dir_seg_val)) @@ -380,7 +382,7 @@ def run(_config, num_patches_x = transformer_num_patches_xy[0] num_patches_y = transformer_num_patches_xy[1] num_patches = num_patches_x * num_patches_y - + if transformer_cnn_first: model_builder = vit_resnet50_unet multiple_of_32 = True @@ -413,13 +415,13 @@ def run(_config, model_builder.config = _config 
model_builder.logger = _log model = model_builder(num_patches) - + assert model is not None #if you want to see the model structure just uncomment model summary. #model.summary() - + if task in ["segmentation", "binarization"]: - if is_loss_soft_dice: + if is_loss_soft_dice: loss = soft_dice_loss elif weighted_loss: loss = weighted_categorical_crossentropy(weights) @@ -434,7 +436,7 @@ def run(_config, ignore_class=0, sparse_y_true=False, sparse_y_pred=False)]) - + # generating train and evaluation data gen_kwargs = dict(batch_size=n_batch, input_height=input_height, @@ -447,7 +449,7 @@ def run(_config, ##img_validation_patches = os.listdir(dir_flow_eval_imgs) ##score_best=[] ##score_best.append(0) - + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: @@ -471,7 +473,7 @@ def run(_config, #os.system('rm -rf '+dir_eval_flowing) #model.save(dir_output+'/'+'model'+'.h5') - + elif task=="cnn-rnn-ocr": dir_img, dir_lab = get_dirs_or_files(dir_train) @@ -480,7 +482,7 @@ def run(_config, labs_list = list(os.listdir(dir_lab)) imgs_list_val = list(os.listdir(dir_img_val)) labs_list_val = list(os.listdir(dir_lab_val)) - + with open(characters_txt_file, 'r') as char_txt_f: characters = json.load(char_txt_f) padding_token = len(characters) + 5 @@ -533,7 +535,7 @@ def run(_config, #tf.keras.optimizers.schedules.CosineDecay(initial_learning_rate, decay_steps, alpha) opt = tf.keras.optimizers.Adam(learning_rate=learning_rate) model.compile(optimizer=opt) # rs: loss seems to be (ctc_batch_cost) in last layer - + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: @@ -544,7 +546,7 @@ def run(_config, epochs=n_epochs, callbacks=callbacks, initial_epoch=index_start) - + elif task=='classification': if continue_training: model = load_model(dir_of_start_model, compile=False) @@ -573,7 +575,7 @@ def 
run(_config, monitor='val_f1', #save_best_only=True, # we need all for ensembling mode='max')] - + history = model.fit(trainXY, #class_weight=weights) validation_data=testXY, @@ -586,28 +588,12 @@ def run(_config, f1_threshold_classification) if len(usable_checkpoints) >= 1: _log.info("averaging over usable checkpoints: %s", str(usable_checkpoints)) - all_weights = [] - for epoch in usable_checkpoints: - cp_path = os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch + 1)) - assert os.path.isdir(cp_path), cp_path - model = load_model(cp_path, compile=False) - all_weights.append(model.get_weights()) + usable_checkpoints = [os.path.join(dir_output, 'model_{epoch:02d}'.format(epoch=epoch + 1)) + for epoch in usable_checkpoints] + ens_path = os.path.join(dir_output, 'model_ens_avg') + run_ensembling(usable_checkpoints, ens_path) + _log.info("ensemble model saved under '%s'", ens_path) - new_weights = [] - for layer_weights in zip(*all_weights): - layer_weights = np.array([np.array(weights).mean(axis=0) - for weights in zip(*layer_weights)]) - new_weights.append(layer_weights) - - #model = tf.keras.models.clone_model(model) - model.set_weights(new_weights) - - cp_path = os.path.join(dir_output, 'model_ens_avg') - model.save(cp_path) - with open(os.path.join(cp_path, "config.json"), "w") as fp: - json.dump(_config, fp) # encode dict into JSON - _log.info("ensemble model saved under '%s'", cp_path) - elif task=='reading_order': if continue_training: model = load_model(dir_of_start_model, compile=False) @@ -618,10 +604,10 @@ def run(_config, input_width, weight_decay, pretraining) - + dir_flow_train_imgs = os.path.join(dir_train, 'images') dir_flow_train_labels = os.path.join(dir_train, 'labels') - + classes = os.listdir(dir_flow_train_labels) if augmentation: num_rows = len(classes)*(len(thetha) + 1) @@ -634,7 +620,7 @@ def run(_config, #optimizer=SGD(learning_rate=0.01, momentum=0.9), optimizer=Adam(learning_rate=0.0001), # rs: why not learning_rate? 
metrics=['accuracy']) - + callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: @@ -657,5 +643,3 @@ def run(_config, model_dir = os.path.join(dir_out,'model_best') model.save(model_dir) ''' - - diff --git a/src/eynollah/training/weights_ensembling.py b/src/eynollah/training/weights_ensembling.py index 6dce7fd..01532fd 100644 --- a/src/eynollah/training/weights_ensembling.py +++ b/src/eynollah/training/weights_ensembling.py @@ -1,136 +1,66 @@ -import sys -from glob import glob -from os import environ, devnull -from os.path import join -from warnings import catch_warnings, simplefilter import os +from warnings import catch_warnings, simplefilter +import click import numpy as np -from PIL import Image -import cv2 -environ['TF_CPP_MIN_LOG_LEVEL'] = '3' -stderr = sys.stderr -sys.stderr = open(devnull, 'w') + +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +from ocrd_utils import tf_disable_interactive_logs +tf_disable_interactive_logs() import tensorflow as tf from tensorflow.keras.models import load_model -from tensorflow.python.keras import backend as tensorflow_backend -sys.stderr = stderr -from tensorflow.keras import layers -import tensorflow.keras.losses -from tensorflow.keras.layers import * -import click -import logging - -class Patches(layers.Layer): - def __init__(self, patch_size_x, patch_size_y): - super(Patches, self).__init__() - self.patch_size_x = patch_size_x - self.patch_size_y = patch_size_y - - def call(self, images): - #print(tf.shape(images)[1],'images') - #print(self.patch_size,'self.patch_size') - batch_size = tf.shape(images)[0] - patches = tf.image.extract_patches( - images=images, - sizes=[1, self.patch_size_y, self.patch_size_x, 1], - strides=[1, self.patch_size_y, self.patch_size_x, 1], - rates=[1, 1, 1, 1], - padding="VALID", - ) - #patch_dims = patches.shape[-1] - patch_dims = 
tf.shape(patches)[-1] - patches = tf.reshape(patches, [batch_size, -1, patch_dims]) - return patches - def get_config(self): - - config = super().get_config().copy() - config.update({ - 'patch_size_x': self.patch_size_x, - 'patch_size_y': self.patch_size_y, - }) - return config - - - -class PatchEncoder(layers.Layer): - def __init__(self, **kwargs): - super(PatchEncoder, self).__init__() - self.num_patches = num_patches - self.projection = layers.Dense(units=projection_dim) - self.position_embedding = layers.Embedding( - input_dim=num_patches, output_dim=projection_dim - ) - - def call(self, patch): - positions = tf.range(start=0, limit=self.num_patches, delta=1) - encoded = self.projection(patch) + self.position_embedding(positions) - return encoded - def get_config(self): - - config = super().get_config().copy() - config.update({ - 'num_patches': self.num_patches, - 'projection': self.projection, - 'position_embedding': self.position_embedding, - }) - return config +from .models import ( + PatchEncoder, + Patches, +) - -def start_new_session(): - ###config = tf.compat.v1.ConfigProto() - ###config.gpu_options.allow_growth = True +def run_ensembling(model_dirs, out_dir): + all_weights = [] - ###self.session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() - ###tensorflow_backend.set_session(self.session) - - config = tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True - - session = tf.compat.v1.Session(config=config) # tf.InteractiveSession() - tensorflow_backend.set_session(session) - return session - -def run_ensembling(dir_models, out): - ls_models = os.listdir(dir_models) - - - weights=[] - - for model_name in ls_models: - model = load_model(os.path.join(dir_models,model_name) , compile=False, custom_objects={'PatchEncoder':PatchEncoder, 'Patches': Patches}) - weights.append(model.get_weights()) + for model_dir in model_dirs: + assert os.path.isdir(model_dir), model_dir + model = load_model(model_dir, compile=False, + 
custom_objects=dict(PatchEncoder=PatchEncoder, + Patches=Patches)) + all_weights.append(model.get_weights()) - new_weights = list() + new_weights = [] + for layer_weights in zip(*all_weights): + layer_weights = np.array([np.array(weights).mean(axis=0) + for weights in zip(*layer_weights)]) + new_weights.append(layer_weights) - for weights_list_tuple in zip(*weights): - new_weights.append( - [np.array(weights_).mean(axis=0)\ - for weights_ in zip(*weights_list_tuple)]) - - - - new_weights = [np.array(x) for x in new_weights] - + #model = tf.keras.models.clone_model(model) model.set_weights(new_weights) - model.save(out) - os.system('cp '+os.path.join(os.path.join(dir_models,model_name) , "config.json ")+out) + + model.save(out_dir) + os.system('cp ' + os.path.join(model_dirs[0], "config.json ") + out_dir + "/") @click.command() @click.option( - "--dir_models", - "-dm", - help="directory of models", + "--in", + "-i", + help="input directory of checkpoint models to be read", + multiple=True, + required=True, type=click.Path(exists=True, file_okay=False), ) @click.option( "--out", "-o", help="output directory where ensembled model will be written.", + required=True, type=click.Path(exists=False, file_okay=False), ) +def ensemble_cli(in_, out): + """ + mix multiple model weights + + Load a sequence of models and mix them into a single ensemble model + by averaging their weights. Write the resulting model. 
+ """ + run_ensembling(in_, out) -def main(dir_models, out): - run_ensembling(dir_models, out) - From 2492c257c6a81955915b8175344027cbd4d355d5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Feb 2026 16:52:54 +0100 Subject: [PATCH 071/118] ocrd-tool.json: re-instante light_version and textline_light dummies for backwards compatibility --- src/eynollah/ocrd-tool.json | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/src/eynollah/ocrd-tool.json b/src/eynollah/ocrd-tool.json index 3b500fc..fc61af7 100644 --- a/src/eynollah/ocrd-tool.json +++ b/src/eynollah/ocrd-tool.json @@ -28,7 +28,19 @@ "full_layout": { "type": "boolean", "default": true, - "description": "Try to detect all element subtypes, including drop-caps and headings" + "description": "Try to detect all region subtypes, including drop-capital and heading" + }, + "light_version": { + "type": "boolean", + "default": true, + "enum": [true], + "description": "ignored (only for backwards-compatibility)" + }, + "textline_light": { + "type": "boolean", + "default": true, + "enum": [true], + "description": "ignored (only for backwards-compatibility)" }, "tables": { "type": "boolean", @@ -38,12 +50,12 @@ "curved_line": { "type": "boolean", "default": false, - "description": "try to return contour of textlines instead of just rectangle bounding box. Needs more processing time" + "description": "retrieve textline polygons independent of each other (needs more processing time)" }, "ignore_page_extraction": { "type": "boolean", "default": false, - "description": "if this parameter set to true, this tool would ignore page extraction" + "description": "if true, do not attempt page frame detection (cropping)" }, "allow_scaling": { "type": "boolean", @@ -58,7 +70,7 @@ "right_to_left": { "type": "boolean", "default": false, - "description": "if this parameter set to true, this tool will extract right-to-left reading order." 
+ "description": "if true, return reading order in right-to-left reading direction." }, "headers_off": { "type": "boolean", @@ -123,13 +135,22 @@ } }, "resources": [ + { + "url": "https://zenodo.org/records/17580627/files/models_all_v0_7_0.zip?download=1", + "name": "models_layout_v0_7_0", + "type": "archive", + "size": 6119874002, + "description": "Models for layout detection, reading order detection, textline detection, page extraction, column classification, table detection, binarization, image enhancement and OCR", + "version_range": ">= v0.7.0" + }, { "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2020_01_16.zip", "name": "default", "type": "archive", "path_in_archive": "saved_model_2020_01_16", "size": 563147331, - "description": "default models provided by github.com/qurator-spk (SavedModel format)" + "description": "default models provided by github.com/qurator-spk (SavedModel format)", + "version_range": "< v0.7.0" }, { "url": "https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip", @@ -137,7 +158,8 @@ "type": "archive", "path_in_archive": ".", "size": 133230419, - "description": "updated default models provided by github.com/qurator-spk (SavedModel format)" + "description": "updated default models provided by github.com/qurator-spk (SavedModel format)", + "version_range": "< v0.7.0" } ] } From ea285124ce11aa9c00d02d2e939803a067931a61 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:06:57 +0100 Subject: [PATCH 072/118] fix Patches/PatchEncoder (make configurable again) --- src/eynollah/patch_encoder.py | 52 ++++++++++++++------------------- src/eynollah/training/models.py | 22 +++----------- 2 files changed, 26 insertions(+), 48 deletions(-) diff --git a/src/eynollah/patch_encoder.py b/src/eynollah/patch_encoder.py index dc0a291..07b843d 100644 --- a/src/eynollah/patch_encoder.py +++ b/src/eynollah/patch_encoder.py @@ -3,52 +3,44 @@ 
os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras import layers -projection_dim = 64 -patch_size = 1 -num_patches =21*21#14*14#28*28#14*14#28*28 - class PatchEncoder(layers.Layer): - def __init__(self): + # 441=21*21 # 14*14 # 28*28 + def __init__(self, num_patches=441, projection_dim=64): super().__init__() - self.projection = layers.Dense(units=projection_dim) - self.position_embedding = layers.Embedding(input_dim=num_patches, output_dim=projection_dim) + self.num_patches = num_patches + self.projection_dim = projection_dim + self.projection = layers.Dense(self.projection_dim) + self.position_embedding = layers.Embedding(self.num_patches, self.projection_dim) def call(self, patch): - positions = tf.range(start=0, limit=num_patches, delta=1) - encoded = self.projection(patch) + self.position_embedding(positions) - return encoded + positions = tf.range(start=0, limit=self.num_patches, delta=1) + return self.projection(patch) + self.position_embedding(positions) def get_config(self): - config = super().get_config().copy() - config.update({ - 'num_patches': num_patches, - 'projection': self.projection, - 'position_embedding': self.position_embedding, - }) - return config + return dict(num_patches=self.num_patches, + projection_dim=self.projection_dim, + **super().get_config()) class Patches(layers.Layer): - def __init__(self, **kwargs): - super(Patches, self).__init__() - self.patch_size = patch_size + def __init__(self, patch_size_x=1, patch_size_y=1): + super().__init__() + self.patch_size_x = patch_size_x + self.patch_size_y = patch_size_y def call(self, images): batch_size = tf.shape(images)[0] patches = tf.image.extract_patches( images=images, - sizes=[1, self.patch_size, self.patch_size, 1], - strides=[1, self.patch_size, self.patch_size, 1], + sizes=[1, self.patch_size_y, self.patch_size_x, 1], + strides=[1, self.patch_size_y, self.patch_size_x, 1], rates=[1, 1, 1, 1], padding="VALID", ) 
patch_dims = patches.shape[-1] - patches = tf.reshape(patches, [batch_size, -1, patch_dims]) - return patches - def get_config(self): + return tf.reshape(patches, [batch_size, -1, patch_dims]) - config = super().get_config().copy() - config.update({ - 'patch_size': self.patch_size, - }) - return config + def get_config(self): + return dict(patch_size_x=self.patch_size_x, + patch_size_y=self.patch_size_y, + **super().get_config()) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index d1148f1..b0ad51c 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -423,16 +423,9 @@ def vit_resnet50_unet(num_patches, #num_patches = x.shape[1]*x.shape[2] - # rs: fixme patch size not configurable anymore... - #patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) - patches = Patches()(x) - assert transformer_patchsize_x == transformer_patchsize_y == 1 + patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(x) # Encode patches. - # rs: fixme num patches and dim not configurable anymore... - #encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) - encoded_patches = PatchEncoder()(patches) - assert num_patches == 21 * 21 - assert transformer_projection_dim == 64 + encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) for _ in range(transformer_layers): # Layer normalization 1. @@ -530,16 +523,9 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, IMAGE_ORDERING = 'channels_last' bn_axis=3 - # rs: fixme patch size not configurable anymore... - #patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) - patches = Patches()(inputs) - assert transformer_patchsize_x == transformer_patchsize_y == 1 + patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) # Encode patches. - # rs: fixme num patches and dim not configurable anymore... 
- #encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) - encoded_patches = PatchEncoder()(patches) - assert num_patches == 21 * 21 - assert transformer_projection_dim == 64 + encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) for _ in range(transformer_layers): # Layer normalization 1. From 53252a59c6619bbf0d164a8c7fc6c98449b208ec Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:09:40 +0100 Subject: [PATCH 073/118] training.models: fix glitch introduced in 3a73ccca --- src/eynollah/training/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index b0ad51c..5b23ecd 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -443,7 +443,6 @@ def vit_resnet50_unet(num_patches, # Skip connection 2. encoded_patches = Add()([x3, x2]) - assert isinstance(x, Layer) encoded_patches = tf.reshape(encoded_patches, [-1, x.shape[1], x.shape[2], transformer_projection_dim // (transformer_patchsize_x * From ee4bffd81d211697b608b93bf2a3986de1f4ed85 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:10:13 +0100 Subject: [PATCH 074/118] training.train: simplify transformer cfg checks --- src/eynollah/training/train.py | 37 +++++++++++++++------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 217ab35..ecf70b4 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -385,31 +385,26 @@ def run(_config, if transformer_cnn_first: model_builder = vit_resnet50_unet - multiple_of_32 = True + multiple = 32 else: model_builder = vit_resnet50_unet_transformer_before_cnn - multiple_of_32 = False + multiple = 1 - assert input_height == (num_patches_y * - transformer_patchsize_y * - (32 if multiple_of_32 else 1)), \ - "transformer_patchsize_y or transformer_num_patches_xy 
height value error: " \ - "input_height should be equal to " \ - "(transformer_num_patches_xy height value * transformer_patchsize_y%s)" % \ - " * 32" if multiple_of_32 else "" - assert input_width == (num_patches_x * - transformer_patchsize_x * - (32 if multiple_of_32 else 1)), \ - "transformer_patchsize_x or transformer_num_patches_xy width value error: " \ - "input_width should be equal to " \ - "(transformer_num_patches_xy width value * transformer_patchsize_x%s)" % \ - " * 32" if multiple_of_32 else "" + assert input_height == ( + num_patches_y * transformer_patchsize_y * multiple), ( + "transformer_patchsize_y or transformer_num_patches_xy height value error: " + "input_height should be equal to " + "(transformer_num_patches_xy height value * transformer_patchsize_y * %d)" % multiple) + assert input_width == ( + num_patches_x * transformer_patchsize_x * multiple), ( + "transformer_patchsize_x or transformer_num_patches_xy width value error: " + "input_width should be equal to " + "(transformer_num_patches_xy width value * transformer_patchsize_x * %d)" % multiple) assert 0 == (transformer_projection_dim % - (transformer_patchsize_y * - transformer_patchsize_x)), \ - "transformer_projection_dim error: " \ - "The remainder when parameter transformer_projection_dim is divided by " \ - "(transformer_patchsize_y*transformer_patchsize_x) should be zero" + (transformer_patchsize_y * transformer_patchsize_x)), ( + "transformer_projection_dim error: " + "The remainder when parameter transformer_projection_dim is divided by " + "(transformer_patchsize_y*transformer_patchsize_x) should be zero") model_builder = create_captured_function(model_builder) model_builder.config = _config From 7b7ef041ec397fe89c62f9bc1be843b09285f941 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:10:56 +0100 Subject: [PATCH 075/118] training.models: use asymmetric zero padding instead of lambda layer --- src/eynollah/training/models.py | 9 +-------- 1 file changed, 1 
insertion(+), 8 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 5b23ecd..115a196 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -69,16 +69,9 @@ def mlp(x, hidden_units, dropout_rate): return x def one_side_pad(x): - # rs: fixme: lambda layers are problematic for de/serialization! - # - can we use ZeroPadding1D instead of ZeroPadding2D+Lambda? - x = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(x) - if IMAGE_ORDERING == 'channels_first': - x = Lambda(lambda x: x[:, :, :-1, :-1])(x) - elif IMAGE_ORDERING == 'channels_last': - x = Lambda(lambda x: x[:, :-1, :-1, :])(x) + x = ZeroPadding2D(((1, 0), (1, 0)), data_format=IMAGE_ORDERING)(x) return x - def identity_block(input_tensor, kernel_size, filters, stage, block): """The identity block is the block that has no conv layer at shortcut. # Arguments From 37338049af618383ca2f2c6708dd91b294b77872 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:11:44 +0100 Subject: [PATCH 076/118] training: use relative imports --- src/eynollah/training/inference.py | 2 +- src/eynollah/training/models.py | 2 +- src/eynollah/training/weights_ensembling.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/eynollah/training/inference.py b/src/eynollah/training/inference.py index 2b26210..c38b79f 100644 --- a/src/eynollah/training/inference.py +++ b/src/eynollah/training/inference.py @@ -25,7 +25,7 @@ from .gt_gen_utils import ( resize_image, update_list_and_return_first_with_length_bigger_than_one ) -from .models import ( +from ..patch_encoder import ( PatchEncoder, Patches ) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 115a196..6182c9e 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -30,7 +30,7 @@ from tensorflow.keras.layers import ( from tensorflow.keras.models import Model from tensorflow.keras.regularizers import l2 -from 
eynollah.patch_encoder import Patches, PatchEncoder +from ..patch_encoder import Patches, PatchEncoder ##mlp_head_units = [512, 256]#[2048, 1024] ###projection_dim = 64 diff --git a/src/eynollah/training/weights_ensembling.py b/src/eynollah/training/weights_ensembling.py index 01532fd..e3ede24 100644 --- a/src/eynollah/training/weights_ensembling.py +++ b/src/eynollah/training/weights_ensembling.py @@ -12,7 +12,7 @@ tf_disable_interactive_logs() import tensorflow as tf from tensorflow.keras.models import load_model -from .models import ( +from ..patch_encoder import ( PatchEncoder, Patches, ) From 514a897dd5392bf7a160bf02c82b04da1fc53bb0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:11:57 +0100 Subject: [PATCH 077/118] training.train: assert n_epochs vs. index_start --- src/eynollah/training/train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index ecf70b4..73d5e0b 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -275,6 +275,9 @@ def run(_config, run configured experiment via sacred """ + if continue_training: + assert n_epochs > index_start, "with continue_training, n_epochs must be greater than index_start" + if pretraining and not os.path.isfile(RESNET50_WEIGHTS_PATH): _log.info("downloading RESNET50 pretrained weights to %s", RESNET50_WEIGHTS_PATH) download_file(RESNET50_WEIGHTS_URL, RESNET50_WEIGHTS_PATH) From 83c2408192950f472e7c8960170cb270ba1a63af Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 01:25:53 +0100 Subject: [PATCH 078/118] training.utils.data_gen: avoid repeated array allocation --- src/eynollah/training/utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index 56d6bdf..a03d539 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -600,10 +600,9 @@ def 
data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_c c = 0 n = [f for f in os.listdir(img_folder) if not f.startswith('.')] # os.listdir(img_folder) #List of training images random.shuffle(n) + img = np.zeros((batch_size, input_height, input_width, 3), dtype=float) + mask = np.zeros((batch_size, input_height, input_width, n_classes), dtype=float) while True: - img = np.zeros((batch_size, input_height, input_width, 3)).astype('float') - mask = np.zeros((batch_size, input_height, input_width, n_classes)).astype('float') - for i in range(c, c + batch_size): # initially from 0 to 16, c = 0. try: filename = os.path.splitext(n[i])[0] @@ -612,21 +611,22 @@ def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_c train_img = cv2.resize(train_img, (input_width, input_height), interpolation=cv2.INTER_NEAREST) # Read an image from folder and resize - img[i - c] = train_img # add to array - img[0], img[1], and so on. + img[i - c, :] = train_img # add to array - img[0], img[1], and so on. if task == "segmentation" or task=="binarization": train_mask = cv2.imread(mask_folder + '/' + filename + '.png') - train_mask = get_one_hot(resize_image(train_mask, input_height, input_width), input_height, input_width, - n_classes) + train_mask = resize_image(train_mask, input_height, input_width) + train_mask = get_one_hot(train_mask, input_height, input_width, n_classes) elif task == "enhancement": train_mask = cv2.imread(mask_folder + '/' + filename + '.png')/255. train_mask = resize_image(train_mask, input_height, input_width) # train_mask = train_mask.reshape(224, 224, 1) # Add extra dimension for parity with train_img size [512 * 512 * 3] - mask[i - c] = train_mask - except: - img[i - c] = np.ones((input_height, input_width, 3)).astype('float') - mask[i - c] = np.zeros((input_height, input_width, n_classes)).astype('float') + mask[i - c, :] = train_mask + except Exception as e: + print(str(e)) + img[i - c, :] = 1. + mask[i - c, :] = 0. 
c += batch_size if c + batch_size >= len(os.listdir(img_folder)): From 7888fa5968d12bf5d485705b90c805f922997d89 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 8 Feb 2026 04:42:44 +0100 Subject: [PATCH 079/118] training: remove `data_gen` in favor of tf.data pipelines instead of looping over file pairs indefinitely, yielding Numpy arrays: re-use `keras.utils.image_dataset_from_directory` here as well, but with img/label generators zipped together (thus, everything will already be loaded/prefetched on the GPU) --- src/eynollah/training/train.py | 61 ++++++++++++++++++---------------- src/eynollah/training/utils.py | 38 --------------------- 2 files changed, 32 insertions(+), 67 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 73d5e0b..05a7346 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -13,6 +13,7 @@ from tensorflow.keras.models import load_model from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard from tensorflow.keras.layers import StringLookup from tensorflow.keras.utils import image_dataset_from_directory +from tensorflow.keras.backend import one_hot from sacred import Experiment from sacred.config import create_captured_function @@ -36,7 +37,6 @@ from .models import ( RESNET50_WEIGHTS_URL ) from .utils import ( - data_gen, generate_arrays_from_folder_reading_order, get_one_hot, preprocess_imgs, @@ -435,43 +435,46 @@ def run(_config, sparse_y_true=False, sparse_y_pred=False)]) - # generating train and evaluation data - gen_kwargs = dict(batch_size=n_batch, - input_height=input_height, - input_width=input_width, - n_classes=n_classes, - task=task) - train_gen = data_gen(dir_flow_train_imgs, dir_flow_train_labels, **gen_kwargs) - val_gen = data_gen(dir_flow_eval_imgs, dir_flow_eval_labels, **gen_kwargs) - - ##img_validation_patches = os.listdir(dir_flow_eval_imgs) - ##score_best=[] - ##score_best.append(0) + def get_dataset(dir_imgs, dir_labs, 
shuffle=None): + gen_kwargs = dict(labels=None, + label_mode=None, + batch_size=1, # batch after zip below + image_size=(input_height, input_width), + color_mode='rgb', + shuffle=shuffle is not None, + seed=shuffle, + interpolation='nearest', + crop_to_aspect_ratio=False, + # Keras 3 only... + #pad_to_aspect_ratio=False, + #data_format='channel_last', + #verbose=False, + ) + img_gen = image_dataset_from_directory(dir_imgs, **gen_kwargs) + lab_gen = image_dataset_from_directory(dir_labs, **gen_kwargs) + if task in ["segmentation", "binarization"]: + @tf.function + def to_categorical(seg): + seg = tf.image.rgb_to_grayscale(seg) + seg = tf.cast(seg, tf.int8) + seg = tf.squeeze(seg, axis=-1) + return one_hot(seg, n_classes) + lab_gen = lab_gen.map(to_categorical) + return tf.data.Dataset.zip(img_gen, lab_gen).rebatch(n_batch, drop_remainder=True) + train_gen = get_dataset(dir_flow_train_imgs, dir_flow_train_labels, shuffle=np.random.randint(1e6)) + val_gen = get_dataset(dir_flow_eval_imgs, dir_flow_eval_labels) callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) - - steps_train = len(os.listdir(dir_flow_train_imgs)) // n_batch # - 1 - steps_val = len(os.listdir(dir_flow_eval_imgs)) // n_batch - _log.info("training on %d batches in %d epochs", steps_train, n_epochs) - _log.info("validating on %d batches", steps_val) model.fit( - train_gen, - steps_per_epoch=steps_train, - validation_data=val_gen, - #validation_steps=1, # rs: only one batch?? - validation_steps=steps_val, + train_gen.prefetch(tf.data.AUTOTUNE), # .repeat()?? 
+ validation_data=val_gen.prefetch(tf.data.AUTOTUNE), epochs=n_epochs, callbacks=callbacks, initial_epoch=index_start) - #os.system('rm -rf '+dir_train_flowing) - #os.system('rm -rf '+dir_eval_flowing) - - #model.save(dir_output+'/'+'model'+'.h5') - elif task=="cnn-rnn-ocr": dir_img, dir_lab = get_dirs_or_files(dir_train) @@ -524,7 +527,7 @@ def run(_config, drop_remainder=True, #num_parallel_calls=tf.data.AUTOTUNE, ) - train_ds = train_ds.repeat().shuffle().prefetch(20) + train_ds = train_ds.prefetch(tf.data.AUTOTUNE) #initial_learning_rate = 1e-4 #decay_steps = int (n_epochs * ( len_dataset / n_batch )) diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index a03d539..f2f4bdc 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -596,44 +596,6 @@ def generate_arrays_from_folder_reading_order(classes_file_dir, modal_dir, n_bat ret_y= np.zeros((n_batch, n_classes)).astype(np.int16) batchcount = 0 -def data_gen(img_folder, mask_folder, batch_size, input_height, input_width, n_classes, task='segmentation'): - c = 0 - n = [f for f in os.listdir(img_folder) if not f.startswith('.')] # os.listdir(img_folder) #List of training images - random.shuffle(n) - img = np.zeros((batch_size, input_height, input_width, 3), dtype=float) - mask = np.zeros((batch_size, input_height, input_width, n_classes), dtype=float) - while True: - for i in range(c, c + batch_size): # initially from 0 to 16, c = 0. - try: - filename = os.path.splitext(n[i])[0] - - train_img = cv2.imread(img_folder + '/' + n[i]) / 255. - train_img = cv2.resize(train_img, (input_width, input_height), - interpolation=cv2.INTER_NEAREST) # Read an image from folder and resize - - img[i - c, :] = train_img # add to array - img[0], img[1], and so on. 
- if task == "segmentation" or task=="binarization": - train_mask = cv2.imread(mask_folder + '/' + filename + '.png') - train_mask = resize_image(train_mask, input_height, input_width) - train_mask = get_one_hot(train_mask, input_height, input_width, n_classes) - elif task == "enhancement": - train_mask = cv2.imread(mask_folder + '/' + filename + '.png')/255. - train_mask = resize_image(train_mask, input_height, input_width) - - # train_mask = train_mask.reshape(224, 224, 1) # Add extra dimension for parity with train_img size [512 * 512 * 3] - - mask[i - c, :] = train_mask - except Exception as e: - print(str(e)) - img[i - c, :] = 1. - mask[i - c, :] = 0. - - c += batch_size - if c + batch_size >= len(os.listdir(img_folder)): - c = 0 - random.shuffle(n) - yield img, mask - # TODO: Use otsu_copy from utils def otsu_copy(img): From 4414f7b89b4e1488a6955bb40342709ab05c0414 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 14:18:32 +0100 Subject: [PATCH 080/118] training.models.vit_resnet50_unet: re-use `IMAGE_ORDERING` --- src/eynollah/training/models.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 6182c9e..0dc78d2 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -372,12 +372,10 @@ def vit_resnet50_unet(num_patches, transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - #transformer_units = [ - #projection_dim * 2, - #projection_dim, - #] # Size of the transformer layers - IMAGE_ORDERING = 'channels_last' - bn_axis=3 + if IMAGE_ORDERING == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(inputs) x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x) @@ -508,12 +506,10 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, 
transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - ##transformer_units = [ - ##projection_dim * 2, - ##projection_dim, - ##] # Size of the transformer layers - IMAGE_ORDERING = 'channels_last' - bn_axis=3 + if IMAGE_ORDERING == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) # Encode patches. From fcd10c39567376675ec77500ab12645b77cf2c68 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 14:52:04 +0100 Subject: [PATCH 081/118] training.models: re-use RESNET50 builder (+weight init) code --- src/eynollah/training/models.py | 223 +++----------------------------- 1 file changed, 21 insertions(+), 202 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 0dc78d2..406e937 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -154,19 +154,13 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)) x = Activation('relu')(x) return x - -def resnet50_unet_light(n_classes, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): - assert input_height % 32 == 0 - assert input_width % 32 == 0 - - img_input = Input(shape=(input_height, input_width, 3)) - +def resnet50(inputs, weight_decay=1e-6, pretraining=False): if IMAGE_ORDERING == 'channels_last': bn_axis = 3 else: bn_axis = 1 - x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(img_input) + x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(inputs) x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2), kernel_regularizer=l2(weight_decay), name='conv1')(x) f1 = x @@ -200,7 +194,17 @@ def resnet50_unet_light(n_classes, input_height=224, input_width=224, task="segm f5 = x if pretraining: - model = Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) + model = Model(inputs, x).load_weights(RESNET50_WEIGHTS_PATH) + + return f1, 
f2, f3, f4, f5 + +def resnet50_unet_light(n_classes, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): + assert input_height % 32 == 0 + assert input_width % 32 == 0 + + img_input = Input(shape=(input_height, input_width, 3)) + + f1, f2, f3, f4, f5 = resnet50(img_input, weight_decay, pretraining) v512_2048 = Conv2D(512, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f5) v512_2048 = (BatchNormalization(axis=bn_axis))(v512_2048) @@ -262,46 +266,7 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati img_input = Input(shape=(input_height, input_width, 3)) - if IMAGE_ORDERING == 'channels_last': - bn_axis = 3 - else: - bn_axis = 1 - - x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(img_input) - x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2), kernel_regularizer=l2(weight_decay), - name='conv1')(x) - f1 = x - - x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x) - x = Activation('relu')(x) - x = MaxPooling2D((3, 3), data_format=IMAGE_ORDERING, strides=(2, 2))(x) - - x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) - x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') - x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') - f2 = one_side_pad(x) - - x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') - f3 = x - - x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f') - f4 = 
x - - x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') - f5 = x - - if pretraining: - Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) + f1, f2, f3, f4, f5 = resnet50(img_input, weight_decay, pretraining) v1024_2048 = Conv2D(1024, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))( f5) @@ -372,47 +337,7 @@ def vit_resnet50_unet(num_patches, transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - if IMAGE_ORDERING == 'channels_last': - bn_axis = 3 - else: - bn_axis = 1 - - x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(inputs) - x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x) - f1 = x - - x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x) - x = Activation('relu')(x) - x = MaxPooling2D((3, 3), data_format=IMAGE_ORDERING, strides=(2, 2))(x) - - x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) - x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') - x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') - f2 = one_side_pad(x) - - x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') - f3 = x - - x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f') - f4 = x - - x = conv_block(x, 3, [512, 512, 2048], stage=5, 
block='a') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') - f5 = x - - if pretraining: - model = Model(inputs, x).load_weights(RESNET50_WEIGHTS_PATH) - - #num_patches = x.shape[1]*x.shape[2] + f1, f2, f3, f4, f5 = resnet50(inputs, weight_decay, pretraining) patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(x) # Encode patches. @@ -540,42 +465,9 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, encoded_patches = Conv2D(3, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay), name='convinput')(encoded_patches) - x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(encoded_patches) - x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x) - f1 = x + f1, f2, f3, f4, f5 = resnet50(encoded_patches, weight_decay, pretraining) - x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x) - x = Activation('relu')(x) - x = MaxPooling2D((3, 3), data_format=IMAGE_ORDERING, strides=(2, 2))(x) - - x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) - x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') - x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') - f2 = one_side_pad(x) - - x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') - f3 = x - - x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f') - f4 = x - - x = 
conv_block(x, 3, [512, 512, 2048], stage=5, block='a') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') - f5 = x - - if pretraining: - model = Model(encoded_patches, x).load_weights(RESNET50_WEIGHTS_PATH) - - v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(x) + v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(f5) v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) v1024_2048 = Activation('relu')(v1024_2048) @@ -633,47 +525,7 @@ def resnet50_classifier(n_classes,input_height=224,input_width=224,weight_decay= img_input = Input(shape=(input_height,input_width , 3 )) - if IMAGE_ORDERING == 'channels_last': - bn_axis = 3 - else: - bn_axis = 1 - - x = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(img_input) - x = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x) - f1 = x - - x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x) - x = Activation('relu')(x) - x = MaxPooling2D((3, 3) , data_format=IMAGE_ORDERING , strides=(2, 2))(x) - - - x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) - x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') - x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') - f2 = one_side_pad(x ) - - - x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='b') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='c') - x = identity_block(x, 3, [128, 128, 512], stage=3, block='d') - f3 = x - - x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d') - x = 
identity_block(x, 3, [256, 256, 1024], stage=4, block='e') - x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f') - f4 = x - - x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') - x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') - f5 = x - - if pretraining: - Model(img_input, x).load_weights(RESNET50_WEIGHTS_PATH) + _, _, _, _, x = resnet50(img_input, weight_decay, pretraining) x = AveragePooling2D((7, 7), name='avg_pool')(x) x = Flatten()(x) @@ -693,43 +545,10 @@ def machine_based_reading_order_model(n_classes,input_height=224,input_width=224 img_input = Input(shape=(input_height,input_width , 3 )) - if IMAGE_ORDERING == 'channels_last': - bn_axis = 3 - else: - bn_axis = 1 - - x1 = ZeroPadding2D((3, 3), data_format=IMAGE_ORDERING)(img_input) - x1 = Conv2D(64, (7, 7), data_format=IMAGE_ORDERING, strides=(2, 2),kernel_regularizer=l2(weight_decay), name='conv1')(x1) - - x1 = BatchNormalization(axis=bn_axis, name='bn_conv1')(x1) - x1 = Activation('relu')(x1) - x1 = MaxPooling2D((3, 3) , data_format=IMAGE_ORDERING , strides=(2, 2))(x1) + _, _, _, _, x = resnet50(img_input, weight_decay, pretraining) - x1 = conv_block(x1, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) - x1 = identity_block(x1, 3, [64, 64, 256], stage=2, block='b') - x1 = identity_block(x1, 3, [64, 64, 256], stage=2, block='c') - - x1 = conv_block(x1, 3, [128, 128, 512], stage=3, block='a') - x1 = identity_block(x1, 3, [128, 128, 512], stage=3, block='b') - x1 = identity_block(x1, 3, [128, 128, 512], stage=3, block='c') - x1 = identity_block(x1, 3, [128, 128, 512], stage=3, block='d') - - x1 = conv_block(x1, 3, [256, 256, 1024], stage=4, block='a') - x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='b') - x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='c') - x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='d') - x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, 
block='e') - x1 = identity_block(x1, 3, [256, 256, 1024], stage=4, block='f') - - x1 = conv_block(x1, 3, [512, 512, 2048], stage=5, block='a') - x1 = identity_block(x1, 3, [512, 512, 2048], stage=5, block='b') - x1 = identity_block(x1, 3, [512, 512, 2048], stage=5, block='c') - - if pretraining: - Model(img_input , x1).load_weights(RESNET50_WEIGHTS_PATH) - - x1 = AveragePooling2D((7, 7), name='avg_pool1')(x1) - flattened = Flatten()(x1) + x = AveragePooling2D((7, 7), name='avg_pool1')(x) + flattened = Flatten()(x) o = Dense(256, activation='relu', name='fc512')(flattened) o=Dropout(0.2)(o) From daa084c3674f0ad66abd08bc9ad8b42634a3dcde Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 17:11:15 +0100 Subject: [PATCH 082/118] training.models: re-use UNet decoder builder code --- src/eynollah/training/models.py | 297 +++++++++----------------------- 1 file changed, 85 insertions(+), 212 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 406e937..a03f028 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -198,67 +198,82 @@ def resnet50(inputs, weight_decay=1e-6, pretraining=False): return f1, f2, f3, f4, f5 +def unet_decoder(img, f1, f2, f3, f4, f5, n_classes, light=False, task="segmentation", weight_decay=1e-6): + if IMAGE_ORDERING == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + + o = Conv2D(512 if light else 1024, (1, 1), padding='same', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f5) + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + if light: + f4 = Conv2D(512, (1, 1), padding='same', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f4) + f4 = BatchNormalization(axis=bn_axis)(f4) + f4 = Activation('relu')(f4) + + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = concatenate([o, f4], axis=MERGE_AXIS) + o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) + o = Conv2D(512, (3, 
3), padding='valid', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = concatenate([o, f3], axis=MERGE_AXIS) + o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) + o = Conv2D(256, (3, 3), padding='valid', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = concatenate([o, f2], axis=MERGE_AXIS) + o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) + o = Conv2D(128, (3, 3), padding='valid', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = concatenate([o, f1], axis=MERGE_AXIS) + o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) + o = Conv2D(64, (3, 3), padding='valid', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = concatenate([o, img], axis=MERGE_AXIS) + o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) + o = Conv2D(32, (3, 3), padding='valid', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('relu')(o) + + o = Conv2D(n_classes, (1, 1), padding='same', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) + if task == "segmentation": + o = BatchNormalization(axis=bn_axis)(o) + o = Activation('softmax')(o) + else: + o = Activation('sigmoid')(o) + + return Model(img, o) + def resnet50_unet_light(n_classes, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): assert input_height % 32 == 0 assert 
input_width % 32 == 0 img_input = Input(shape=(input_height, input_width, 3)) - f1, f2, f3, f4, f5 = resnet50(img_input, weight_decay, pretraining) - - v512_2048 = Conv2D(512, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f5) - v512_2048 = (BatchNormalization(axis=bn_axis))(v512_2048) - v512_2048 = Activation('relu')(v512_2048) - - v512_1024 = Conv2D(512, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(f4) - v512_1024 = (BatchNormalization(axis=bn_axis))(v512_1024) - v512_1024 = Activation('relu')(v512_1024) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(v512_2048) - o = (concatenate([o, v512_1024], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(512, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f3], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f2], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(128, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f1], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) 
- o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, img_input], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(32, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) - if task == "segmentation": - o = (BatchNormalization(axis=bn_axis))(o) - o = (Activation('softmax'))(o) - else: - o = (Activation('sigmoid'))(o) - - model = Model(img_input, o) - return model + features = resnet50(img_input, weight_decay=weight_decay, pretraining=pretraining) + return unet_decoder(img_input, *features, n_classes, light=True, task=task, weight_decay=weight_decay) def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentation", weight_decay=1e-6, pretraining=False): assert input_height % 32 == 0 @@ -266,59 +281,9 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati img_input = Input(shape=(input_height, input_width, 3)) - f1, f2, f3, f4, f5 = resnet50(img_input, weight_decay, pretraining) - - v1024_2048 = Conv2D(1024, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))( - f5) - v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) - v1024_2048 = Activation('relu')(v1024_2048) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(v1024_2048) - o = (concatenate([o, f4], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(512, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f3], axis=MERGE_AXIS)) - o = 
(ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f2], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(128, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f1], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, img_input], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(32, (3, 3), padding='valid', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay))(o) - if task == "segmentation": - o = (BatchNormalization(axis=bn_axis))(o) - o = (Activation('softmax'))(o) - else: - o = (Activation('sigmoid'))(o) - - model = Model(img_input, o) - - return model + features = resnet50(img_input, weight_decay=weight_decay, pretraining=pretraining) + return unet_decoder(img_input, *features, n_classes, light=False, task=task, weight_decay=weight_decay) def vit_resnet50_unet(num_patches, n_classes, @@ -337,9 +302,9 @@ def vit_resnet50_unet(num_patches, transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, 
input_width, 3)) - f1, f2, f3, f4, f5 = resnet50(inputs, weight_decay, pretraining) + features = resnet50(inputs, weight_decay=weight_decay, pretraining=pretraining) - patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(x) + patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(features[-1]) # Encode patches. encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) @@ -360,59 +325,16 @@ def vit_resnet50_unet(num_patches, encoded_patches = Add()([x3, x2]) encoded_patches = tf.reshape(encoded_patches, - [-1, x.shape[1], x.shape[2], + [-1, + features[-1].shape[1], + features[-1].shape[2], transformer_projection_dim // (transformer_patchsize_x * transformer_patchsize_y)]) + features[-1] = encoded_patches - v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(encoded_patches) - v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) - v1024_2048 = Activation('relu')(v1024_2048) - - o = (UpSampling2D( (2, 2), data_format=IMAGE_ORDERING))(v1024_2048) - o = (concatenate([o, f4],axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(512, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o ,f3], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f2], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(128, (3, 3), padding='valid', 
data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f1], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, inputs],axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(32, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(o) - if task == "segmentation": - o = (BatchNormalization(axis=bn_axis))(o) - o = (Activation('softmax'))(o) - else: - o = (Activation('sigmoid'))(o) + o = unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) - model = Model(inputs=inputs, outputs=o) - - return model + return Model(inputs, o) def vit_resnet50_unet_transformer_before_cnn(num_patches, n_classes, @@ -431,11 +353,6 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - if IMAGE_ORDERING == 'channels_last': - bn_axis = 3 - else: - bn_axis = 1 - patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) # Encode patches. 
encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) @@ -463,59 +380,15 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, transformer_projection_dim // (transformer_patchsize_x * transformer_patchsize_y)]) - encoded_patches = Conv2D(3, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay), name='convinput')(encoded_patches) + encoded_patches = Conv2D(3, (1, 1), padding='same', + data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay), + name='convinput')(encoded_patches) - f1, f2, f3, f4, f5 = resnet50(encoded_patches, weight_decay, pretraining) + features = resnet50(encoded_patches, weight_decay=weight_decay, pretraining=pretraining) - v1024_2048 = Conv2D( 1024 , (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(f5) - v1024_2048 = (BatchNormalization(axis=bn_axis))(v1024_2048) - v1024_2048 = Activation('relu')(v1024_2048) + o = unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) - o = (UpSampling2D( (2, 2), data_format=IMAGE_ORDERING))(v1024_2048) - o = (concatenate([o, f4],axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(512, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o ,f3], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f2], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(128, (3, 3), padding='valid', 
data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, f1], axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) - o = (concatenate([o, inputs],axis=MERGE_AXIS)) - o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) - o = (Conv2D(32, (3, 3), padding='valid', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay)))(o) - o = (BatchNormalization(axis=bn_axis))(o) - o = Activation('relu')(o) - - o = Conv2D(n_classes, (1, 1), padding='same', data_format=IMAGE_ORDERING,kernel_regularizer=l2(weight_decay))(o) - if task == "segmentation": - o = (BatchNormalization(axis=bn_axis))(o) - o = (Activation('softmax'))(o) - else: - o = (Activation('sigmoid'))(o) - - model = Model(inputs=inputs, outputs=o) - - return model + return Model(inputs, o) def resnet50_classifier(n_classes,input_height=224,input_width=224,weight_decay=1e-6,pretraining=False): include_top=True From 9b66867c217ed17c8d8c30f45cbcc35824a2eb7a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 17:35:20 +0100 Subject: [PATCH 083/118] training.models: re-use transformer builder code --- src/eynollah/training/models.py | 109 ++++++++++++++++---------------- 1 file changed, 53 insertions(+), 56 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index a03f028..4af4949 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -285,6 +285,41 @@ def resnet50_unet(n_classes, input_height=224, input_width=224, task="segmentati return unet_decoder(img_input, *features, n_classes, 
light=False, task=task, weight_decay=weight_decay) +def transformer_block(img, + num_patches, + patchsize_x, + patchsize_y, + mlp_head_units, + n_layers, + num_heads, + projection_dim): + patches = Patches(patchsize_x, patchsize_y)(img) + # Encode patches. + encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + + for _ in range(n_layers): + # Layer normalization 1. + x1 = LayerNormalization(epsilon=1e-6)(encoded_patches) + # Create a multi-head attention layer. + attention_output = MultiHeadAttention(num_heads=num_heads, + key_dim=projection_dim, + dropout=0.1)(x1, x1) + # Skip connection 1. + x2 = Add()([attention_output, encoded_patches]) + # Layer normalization 2. + x3 = LayerNormalization(epsilon=1e-6)(x2) + # MLP. + x3 = mlp(x3, hidden_units=mlp_head_units, dropout_rate=0.1) + # Skip connection 2. + encoded_patches = Add()([x3, x2]) + + encoded_patches = tf.reshape(encoded_patches, + [-1, + img.shape[1], + img.shape[2], + projection_dim // (patchsize_x * patchsize_y)]) + return encoded_patches + def vit_resnet50_unet(num_patches, n_classes, transformer_patchsize_x, @@ -304,33 +339,14 @@ def vit_resnet50_unet(num_patches, features = resnet50(inputs, weight_decay=weight_decay, pretraining=pretraining) - patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(features[-1]) - # Encode patches. - encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) - - for _ in range(transformer_layers): - # Layer normalization 1. - x1 = LayerNormalization(epsilon=1e-6)(encoded_patches) - # Create a multi-head attention layer. - attention_output = MultiHeadAttention( - num_heads=transformer_num_heads, key_dim=transformer_projection_dim, dropout=0.1 - )(x1, x1) - # Skip connection 1. - x2 = Add()([attention_output, encoded_patches]) - # Layer normalization 2. - x3 = LayerNormalization(epsilon=1e-6)(x2) - # MLP. - x3 = mlp(x3, hidden_units=transformer_mlp_head_units, dropout_rate=0.1) - # Skip connection 2. 
- encoded_patches = Add()([x3, x2]) - - encoded_patches = tf.reshape(encoded_patches, - [-1, - features[-1].shape[1], - features[-1].shape[2], - transformer_projection_dim // (transformer_patchsize_x * - transformer_patchsize_y)]) - features[-1] = encoded_patches + features[-1] = transformer_block(features[-1], + num_patches, + transformer_patchsize_x, + transformer_patchsize_y, + transformer_mlp_head_units, + transformer_layers, + transformer_num_heads, + transformer_projection_dim) o = unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) @@ -352,38 +368,19 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, if transformer_mlp_head_units is None: transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - - patches = Patches(transformer_patchsize_x, transformer_patchsize_y)(inputs) - # Encode patches. - encoded_patches = PatchEncoder(num_patches, transformer_projection_dim)(patches) - - for _ in range(transformer_layers): - # Layer normalization 1. - x1 = LayerNormalization(epsilon=1e-6)(encoded_patches) - # Create a multi-head attention layer. - attention_output = MultiHeadAttention( - num_heads=transformer_num_heads, key_dim=transformer_projection_dim, dropout=0.1 - )(x1, x1) - # Skip connection 1. - x2 = Add()([attention_output, encoded_patches]) - # Layer normalization 2. - x3 = LayerNormalization(epsilon=1e-6)(x2) - # MLP. - x3 = mlp(x3, hidden_units=transformer_mlp_head_units, dropout_rate=0.1) - # Skip connection 2. 
- encoded_patches = Add()([x3, x2]) - - encoded_patches = tf.reshape(encoded_patches, - [-1, - input_height, - input_width, - transformer_projection_dim // (transformer_patchsize_x * - transformer_patchsize_y)]) - + + encoded_patches = transformer_block(inputs, + num_patches, + transformer_patchsize_x, + transformer_patchsize_y, + transformer_mlp_head_units, + transformer_layers, + transformer_num_heads, + transformer_projection_dim) encoded_patches = Conv2D(3, (1, 1), padding='same', data_format=IMAGE_ORDERING, kernel_regularizer=l2(weight_decay), name='convinput')(encoded_patches) - + features = resnet50(encoded_patches, weight_decay=weight_decay, pretraining=pretraining) o = unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) From 7bef8fa95abc7a73ffa6648dd3ce936166818484 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 17:24:07 +0100 Subject: [PATCH 084/118] training.train: add verbose=1 consistently --- src/eynollah/training/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 05a7346..87b3551 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -471,6 +471,7 @@ def run(_config, model.fit( train_gen.prefetch(tf.data.AUTOTUNE), # .repeat()?? 
validation_data=val_gen.prefetch(tf.data.AUTOTUNE), + verbose=1, epochs=n_epochs, callbacks=callbacks, initial_epoch=index_start) @@ -544,6 +545,7 @@ def run(_config, model.fit( train_ds, #validation_data=test_ds, + verbose=1, epochs=n_epochs, callbacks=callbacks, initial_epoch=index_start) From c1b5cc92af60963a31965234bf44634dc24b7e7a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 17:43:57 +0100 Subject: [PATCH 085/118] fix typo in 7562317d --- src/eynollah/training/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 87b3551..fbbf920 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -360,7 +360,7 @@ def run(_config, if task == "enhancement": assert not is_loss_soft_dice, "for enhancement, soft_dice loss does not apply" - assert not weighted_dice, "for enhancement, weighted loss does not apply" + assert not weighted_loss, "for enhancement, weighted loss does not apply" if continue_training: custom_objects = dict() if is_loss_soft_dice: From 6a4163ae56f92c5182662da8f704e76577eb5bea Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 17:48:43 +0100 Subject: [PATCH 086/118] fix typo in 27f43c17 --- src/eynollah/training/train.py | 2 +- src/eynollah/training/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index fbbf920..f6117f7 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -523,7 +523,7 @@ def run(_config, ) train_ds = tf.data.Dataset.from_generator(gen) train_ds = train_ds.padded_batch(n_batch, - padded_shapes=([image_height, image_width, 3], [None]), + padded_shapes=([input_height, input_width, 3], [None]), padding_values=(0, padding_token), drop_remainder=True, #num_parallel_calls=tf.data.AUTOTUNE, diff --git a/src/eynollah/training/utils.py 
b/src/eynollah/training/utils.py index f2f4bdc..4b6033e 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -997,12 +997,12 @@ def preprocess_img(img, input_height, input_width) if padding_black: - yield from get_patches(do_padding_black(img), + yield from get_patches(do_padding_with_color(img, 'black'), do_padding_label(lab), input_height, input_width) if padding_white: - yield from get_patches(do_padding_white(img), + yield from get_patches(do_padding_with_color(img, 'white'), do_padding_label(lab), input_height, input_width) @@ -1129,7 +1129,7 @@ def preprocess_img_ocr( return scale_padd_image_for_ocr(img, input_height, input_width).astype(np.float32) / 255. #lab = vectorize_label(lab, char_to_num, padding_token, max_len) # now padded at Dataset.padded_batch - lab = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8")) + lab = char_to_num(tf.strings.unicode_split(lab, input_encoding="UTF-8")) yield scale_image(img), lab #to_yield = {"image": ret_x, "label": ret_y} From 67fca82f384074028445f47fbcfdae44534668e1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 18:06:08 +0100 Subject: [PATCH 087/118] fix missing import in 27f43c17 --- src/eynollah/training/models.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 4af4949..ba61764 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -441,7 +441,7 @@ def cnn_rnn_ocr_model(image_height=None, image_width=None, n_classes=None, max_s x = Conv2D(64,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn2")(x) x = Activation("relu", name="relu2")(x) - x = MaxPool2D(pool_size=(1,2),strides=(1,2))(x) + x = MaxPooling2D(pool_size=(1,2),strides=(1,2))(x) x = Conv2D(128,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn3")(x) @@ -449,7 +449,7 @@ def cnn_rnn_ocr_model(image_height=None, 
image_width=None, n_classes=None, max_s x = Conv2D(128,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn4")(x) x = Activation("relu", name="relu4")(x) - x = MaxPool2D(pool_size=(1,2),strides=(1,2))(x) + x = MaxPooling2D(pool_size=(1,2),strides=(1,2))(x) x = Conv2D(256,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn5")(x) @@ -457,7 +457,7 @@ def cnn_rnn_ocr_model(image_height=None, image_width=None, n_classes=None, max_s x = Conv2D(256,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn6")(x) x = Activation("relu", name="relu6")(x) - x = MaxPool2D(pool_size=(2,2),strides=(2,2))(x) + x = MaxPooling2D(pool_size=(2,2),strides=(2,2))(x) x = Conv2D(image_width,kernel_size=(3,3),padding="same")(x) x = BatchNormalization(name="bn7")(x) @@ -465,8 +465,8 @@ def cnn_rnn_ocr_model(image_height=None, image_width=None, n_classes=None, max_s x = Conv2D(image_width,kernel_size=(16,1))(x) x = BatchNormalization(name="bn8")(x) x = Activation("relu", name="relu8")(x) - x2d = MaxPool2D(pool_size=(1,2),strides=(1,2))(x) - x4d = MaxPool2D(pool_size=(1,2),strides=(1,2))(x2d) + x2d = MaxPooling2D(pool_size=(1,2),strides=(1,2))(x) + x4d = MaxPooling2D(pool_size=(1,2),strides=(1,2))(x2d) new_shape = (x.shape[1]*x.shape[2], x.shape[3]) From 5f713336495a6d392027637a241247d4dd355c79 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 18:11:49 +0100 Subject: [PATCH 088/118] fix missing import in 49261fa9 --- src/eynollah/training/inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eynollah/training/inference.py b/src/eynollah/training/inference.py index c38b79f..2be937d 100644 --- a/src/eynollah/training/inference.py +++ b/src/eynollah/training/inference.py @@ -17,6 +17,7 @@ import xml.etree.ElementTree as ET os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras.models import Model, load_model +from tensorflow.keras.layers import StringLookup 
from .gt_gen_utils import ( filter_contours_area_of_image, From f61effe8ce56e4dd4ebb2d9380b51946dfbac96a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 18:20:58 +0100 Subject: [PATCH 089/118] fix typo in c8240905 --- src/eynollah/training/gt_gen_utils.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py index 8204a8e..d5ad4d9 100644 --- a/src/eynollah/training/gt_gen_utils.py +++ b/src/eynollah/training/gt_gen_utils.py @@ -238,12 +238,11 @@ def update_region_contours(co_text, img_boundary, erosion_rate, dilation_rate, y con_eroded = return_contours_of_interested_region(img_boundary_in,pixel, min_size ) try: - if len(con_eroded)>1: - cnt_size = np.array([cv2.contourArea(con_eroded[j]) for j in range(len(con_eroded))]) - cnt = contours[np.argmax(cnt_size)] - co_text_eroded.append(cnt) + if len(con_eroded) > 1: + largest = np.argmax(list(map(cv2.contourArea, con_eroded))) else: - co_text_eroded.append(con_eroded[0]) + largest = 0 + co_text_eroded.append(con_eroded[largest]) except: co_text_eroded.append(con) From 003c88f18ab513c3622bbc12f3a2bd44e75bd8f3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 18:23:32 +0100 Subject: [PATCH 090/118] fix double import in 82266f82 --- src/eynollah/cli/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/eynollah/cli/__init__.py b/src/eynollah/cli/__init__.py index 05dafa1..43ed046 100644 --- a/src/eynollah/cli/__init__.py +++ b/src/eynollah/cli/__init__.py @@ -2,14 +2,12 @@ # this must be the first import of the CLI! 
from ..eynollah_imports import imported_libs -from .cli_models import models_cli -from .cli_binarize import binarize_cli - from .cli import main from .cli_binarize import binarize_cli from .cli_enhance import enhance_cli from .cli_extract_images import extract_images_cli from .cli_layout import layout_cli +from .cli_models import models_cli from .cli_ocr import ocr_cli from .cli_readingorder import readingorder_cli From a9496bbc7079d11706e34d1fcef4a0269fe23117 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Feb 2026 18:39:30 +0100 Subject: [PATCH 091/118] enhancer/mbreorder: use std Keras data loader for classification --- src/eynollah/image_enhancer.py | 6 ++++-- src/eynollah/mb_ro_on_layout.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/eynollah/image_enhancer.py b/src/eynollah/image_enhancer.py index babbd55..67145a3 100644 --- a/src/eynollah/image_enhancer.py +++ b/src/eynollah/image_enhancer.py @@ -15,11 +15,13 @@ from pathlib import Path import gc import cv2 -from keras.models import Model import numpy as np -import tensorflow as tf # type: ignore from skimage.morphology import skeletonize +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 +import tensorflow as tf # type: ignore +from tensorflow.keras.models import Model + from .model_zoo import EynollahModelZoo from .utils.resize import resize_image from .utils.pil_cv2 import pil2cv diff --git a/src/eynollah/mb_ro_on_layout.py b/src/eynollah/mb_ro_on_layout.py index eec544c..22fe97b 100644 --- a/src/eynollah/mb_ro_on_layout.py +++ b/src/eynollah/mb_ro_on_layout.py @@ -14,10 +14,12 @@ from pathlib import Path import xml.etree.ElementTree as ET import cv2 -from keras.models import Model import numpy as np import statistics + +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf +from tensorflow.keras.models import Model from .model_zoo import EynollahModelZoo from .utils.resize import resize_image From 
56833b3f55c669a07c96ccbfaaf6f99cd0f16bcb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 24 Feb 2026 16:46:19 +0100 Subject: [PATCH 092/118] =?UTF-8?q?training:=20fix=20data=20representation?= =?UTF-8?q?=20in=207888fa5=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (Eynollah models expet BGR/float instead of RGB/int) --- src/eynollah/training/train.py | 37 ++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index f6117f7..4d0b317 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -435,6 +435,19 @@ def run(_config, sparse_y_true=False, sparse_y_pred=False)]) + def _to_cv2float(img): + # rgb→bgr and uint8→float, as expected by Eynollah models + return tf.cast(tf.reverse(img, [-1]), tf.float32) / 255 + def _to_intrgb(img): + # bgr→rgb and float→uint8 for plotting + return tf.reverse(tf.cast(img * 255, tf.uint8), [-1]) + def _to_categorical(seg): + seg = tf.cast(seg * 255, tf.int8) + # gt_gen_utils/pagexml2label uses peculiar pseudo-RGB/index colors + #seg = tf.image.rgb_to_grayscale(seg) + seg = tf.gather(seg, [0], axis=-1) + seg = tf.squeeze(seg, axis=-1) + return one_hot(seg, n_classes) def get_dataset(dir_imgs, dir_labs, shuffle=None): gen_kwargs = dict(labels=None, label_mode=None, @@ -452,25 +465,27 @@ def run(_config, ) img_gen = image_dataset_from_directory(dir_imgs, **gen_kwargs) lab_gen = image_dataset_from_directory(dir_labs, **gen_kwargs) + img_gen = img_gen.map(_to_cv2float) + lab_gen = lab_gen.map(_to_cv2float) if task in ["segmentation", "binarization"]: - @tf.function - def to_categorical(seg): - seg = tf.image.rgb_to_grayscale(seg) - seg = tf.cast(seg, tf.int8) - seg = tf.squeeze(seg, axis=-1) - return one_hot(seg, n_classes) - lab_gen = lab_gen.map(to_categorical) + lab_gen = lab_gen.map(_to_categorical) return tf.data.Dataset.zip(img_gen, 
lab_gen).rebatch(n_batch, drop_remainder=True) train_gen = get_dataset(dir_flow_train_imgs, dir_flow_train_labels, shuffle=np.random.randint(1e6)) - val_gen = get_dataset(dir_flow_eval_imgs, dir_flow_eval_labels) - callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), SaveWeightsAfterSteps(0, dir_output, _config)] + valdn_gen = get_dataset(dir_flow_eval_imgs, dir_flow_eval_labels) + train_steps = len(os.listdir(dir_flow_train_imgs)) // n_batch + valdn_steps = len(os.listdir(dir_flow_eval_imgs)) // n_batch + _log.info("training on %d batches in %d epochs", train_steps, n_epochs) + _log.info("validating on %d batches", valdn_steps) + if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) model.fit( - train_gen.prefetch(tf.data.AUTOTUNE), # .repeat()?? - validation_data=val_gen.prefetch(tf.data.AUTOTUNE), + train_gen.prefetch(tf.data.AUTOTUNE), + steps_per_epoch=train_steps, + validation_data=valdn_gen.prefetch(tf.data.AUTOTUNE), + validation_steps=valdn_steps, verbose=1, epochs=n_epochs, callbacks=callbacks, From 18607e0f485883b965a9e154a35693dd886d381e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 24 Feb 2026 17:00:48 +0100 Subject: [PATCH 093/118] training: plot predictions to TB logs along with training/testing --- src/eynollah/training/train.py | 78 +++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 2 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 4d0b317..5305ee3 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -74,6 +74,79 @@ def configuration(): except: print("no GPU device available", file=sys.stderr) +@tf.function +def plot_layout_tf(in_: tf.Tensor, out:tf.Tensor) -> tf.Tensor: + """ + Implements training.inference.SBBPredict.visualize_model_output for TF + (effectively plotting the layout segmentation map on the input image). 
+ + In doing so, also converts: + - from Eynollah's BGR/float on the input side + - to std RGB/int format on the output side + """ + # in_: [B, H, W, 3] (BGR float) + image = in_[..., ::-1] * 255 + # out: [B, H, W, C] + lab = tf.math.argmax(out, axis=-1) + # lab: [B, H, W] + colors = tf.constant([[255, 255, 255], + [255, 0, 0], + [255, 125, 0], + [255, 0, 125], + [125, 125, 125], + [125, 125, 0], + [0, 125, 255], + [0, 125, 0], + [125, 125, 125], + [0, 125, 255], + [125, 0, 125], + [0, 255, 0], + [0, 0, 255], + [0, 255, 255], + [255, 125, 125], + [255, 0, 255]]) + layout = tf.gather(colors, lab) + # layout: [B, H, W, 3] + image = tf.cast(image, tf.float32) + layout = tf.cast(layout, tf.float32) + #weighted = image * 0.5 + layout * 0.1 (too dark) + weighted = image * 0.9 + layout * 0.1 + return tf.cast(weighted, tf.uint8) + +# plot predictions on train and test set during every epoch +class TensorBoardPlotter(TensorBoard): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.model_call = None + def on_epoch_begin(self, epoch, logs=None): + super().on_epoch_begin(epoch, logs=logs) + self.model_call = self.model.call + @tf.function + def new_call(inputs, **kwargs): + outputs = self.model_call(inputs, **kwargs) + images = plot_layout_tf(inputs, outputs) + self.plot(images, training=kwargs.get('training', None), epoch=epoch) + return outputs + self.model.call = new_call + def on_epoch_end(self, epoch, logs=None): + # re-instate (so ModelCheckpoint does not see our override call) + self.model.call = self.model_call + # force rebuild of tf.function (so Python binding for epoch gets re-evaluated) + self.model.train_function = self.model.make_train_function(True) + self.model.test_function = self.model.make_test_function(True) + super().on_epoch_end(epoch, logs=logs) + def plot(self, images, training=None, epoch=0): + if training: + writer = self._train_writer + mode, step = "train", self._train_step.read_value() + else: + writer = 
self._val_writer + mode, step = "test", self._val_step.read_value() + family = "epoch_%03d" % (1 + epoch) + with writer.as_default(): + # used to be family kwarg for tf.summary.image name prefix + with tf.name_scope(family): + tf.summary.image(mode, images, step=step, max_outputs=len(images)) def get_dirs_or_files(input_data): image_input, labels_input = os.path.join(input_data, 'images/'), os.path.join(input_data, 'labels/') @@ -471,14 +544,15 @@ def run(_config, lab_gen = lab_gen.map(_to_categorical) return tf.data.Dataset.zip(img_gen, lab_gen).rebatch(n_batch, drop_remainder=True) train_gen = get_dataset(dir_flow_train_imgs, dir_flow_train_labels, shuffle=np.random.randint(1e6)) - callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), - SaveWeightsAfterSteps(0, dir_output, _config)] valdn_gen = get_dataset(dir_flow_eval_imgs, dir_flow_eval_labels) train_steps = len(os.listdir(dir_flow_train_imgs)) // n_batch valdn_steps = len(os.listdir(dir_flow_eval_imgs)) // n_batch _log.info("training on %d batches in %d epochs", train_steps, n_epochs) _log.info("validating on %d batches", valdn_steps) + callbacks = [TensorBoardPlotter(os.path.join(dir_output, 'logs'), write_graph=False), + SaveWeightsAfterSteps(0, dir_output, _config), + ] if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) model.fit( From abf111de76957ed34824c8f20f96d023505bacc0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 24 Feb 2026 17:03:21 +0100 Subject: [PATCH 094/118] training: add metric for (same) number of connected components (in trying to capture region instance separability) --- src/eynollah/training/train.py | 49 ++++++++++++++++++++++++++++++---- train/requirements.txt | 1 + 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 5305ee3..233c6a4 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -14,6 
+14,7 @@ from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard from tensorflow.keras.layers import StringLookup from tensorflow.keras.utils import image_dataset_from_directory from tensorflow.keras.backend import one_hot +from tensorflow_addons.image import connected_components from sacred import Experiment from sacred.config import create_captured_function @@ -74,6 +75,42 @@ def configuration(): except: print("no GPU device available", file=sys.stderr) +def num_connected_components_regression(alpha: float): + """ + metric/loss function capturing the separability of segmentation maps + + For both sides (true and predicted, resp.), computes + 1. the argmax() of class-wise softmax input (i.e. the segmentation map) + 2. the connected components (i.e. the instance label map) + 3. the max() (i.e. the highest label = nr of components) + + Then calculates a regression formula between those two targets: + - overall mean squared (to incentivise exact fit) + - additive component (to incentivise more over less segments; + this prevents neighbours of spilling into each other; + oversegmentation is usually not as bad as undersegmentation) + """ + def metric(y_true, y_pred): + # [B, H, W, C] + l_true = tf.math.argmax(y_true, axis=-1) + l_pred = tf.math.argmax(y_pred, axis=-1) + # [B, H, W] + c_true = connected_components(l_true) + c_pred = connected_components(l_pred) + # [B, H, W] + n_batch = tf.shape(y_true)[0] + c_true = tf.reshape(c_true, (n_batch, -1)) + c_pred = tf.reshape(c_pred, (n_batch, -1)) + # [B, H*W] + n_true = tf.math.reduce_max(c_true, axis=1) + n_pred = tf.math.reduce_max(c_pred, axis=1) + # [B] + diff = tf.cast(n_true - n_pred, tf.float32) + return tf.reduce_mean(tf.math.sqrt(tf.math.square(diff) + alpha * diff), axis=-1) + + metric.__name__ = 'nCC' + return metric + @tf.function def plot_layout_tf(in_: tf.Tensor, out:tf.Tensor) -> tf.Tensor: """ @@ -502,11 +539,13 @@ def run(_config, loss = 'mean_squared_error' model.compile(loss=loss, 
optimizer=Adam(learning_rate=learning_rate), - metrics=['accuracy', MeanIoU(n_classes, - name='iou', - ignore_class=0, - sparse_y_true=False, - sparse_y_pred=False)]) + metrics=['accuracy', + num_connected_components_regression(0.1), + MeanIoU(n_classes, + name='iou', + ignore_class=0, + sparse_y_true=False, + sparse_y_pred=False)]) def _to_cv2float(img): # rgb→bgr and uint8→float, as expected by Eynollah models diff --git a/train/requirements.txt b/train/requirements.txt index 8ad884d..6f23d76 100644 --- a/train/requirements.txt +++ b/train/requirements.txt @@ -4,3 +4,4 @@ numpy tqdm imutils scipy +tensorflow-addons # for connected_components From 658dade0d49ec7e96c42de55b0bddcff0eb51561 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 24 Feb 2026 20:36:00 +0100 Subject: [PATCH 095/118] training.config_params: `flip_index` needed for `scaling_flip`, too --- src/eynollah/training/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 233c6a4..07c87d0 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -237,8 +237,6 @@ def config_params(): augmentation = False # To apply any kind of augmentation, this parameter must be set to true. if augmentation: flip_aug = False # Whether different types of flipping will be applied to the image. Requires "flip_index" setting. - if flip_aug: - flip_index = None # List of codes (as in cv2.flip) for flip augmentation. blur_aug = False # Whether images will be blurred. Requires "blur_k" setting. if blur_aug: blur_k = None # Method of blurring (gauss, median or blur). @@ -254,6 +252,8 @@ def config_params(): scaling_flip = False # Whether a combination of scaling and flipping will be applied to the image. if scaling or scaling_brightness or scaling_bluring or scaling_binarization or scaling_flip: scales = None # Scale patches for augmentation. 
+ if flip_aug or scaling_flip: + flip_index = None # List of codes (as in cv2.flip) for flip augmentation. shifting = False brightening = False # Whether images will be brightened. Requires "brightness" setting. if brightening: From 20a3672be322e5c51eb9d43d85934ad6e8bef806 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 24 Feb 2026 20:37:44 +0100 Subject: [PATCH 096/118] training.utils.preprocess_imgs: fix file shuffling in 27f43c17 --- src/eynollah/training/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index 4b6033e..f059354 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -779,9 +779,10 @@ def preprocess_imgs(config, # override keys from call config.update(kwargs) - seed = random.random() - random.shuffle(imgs_list, random=lambda: seed) - random.shuffle(labs_list, random=lambda: seed) + seed = random.getstate() + random.shuffle(imgs_list) + random.setstate(seed) + random.shuffle(labs_list) # labs_list not used because stem matching more robust indexer = 0 From 86b009bc319835fc6b4888b1f6c215a7d7b7a15b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 24 Feb 2026 20:41:08 +0100 Subject: [PATCH 097/118] training.utils.preprocess_imgs: fix file name stemming 27f43c17 --- src/eynollah/training/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index f059354..818fd14 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -787,8 +787,8 @@ def preprocess_imgs(config, # labs_list not used because stem matching more robust indexer = 0 for img, lab in tqdm(zip(imgs_list, labs_list)): - img = cv2.imread(os.path.join(dir_img, img)) img_name = os.path.splitext(img)[0] + img = cv2.imread(os.path.join(dir_img, img)) if config['task'] in ["segmentation", "binarization"]: # assert lab == img_name + '.png' lab = 
cv2.imread(os.path.join(dir_lab, img_name + '.png')) From 92fc2bd815b6bbfe9fbe5200240315813b51b08c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 24 Feb 2026 20:42:08 +0100 Subject: [PATCH 098/118] training.train: fix data batching for OCR in 27f43c17 --- src/eynollah/training/train.py | 23 ++++---- src/eynollah/training/utils.py | 105 +++++++++++++++++---------------- 2 files changed, 63 insertions(+), 65 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 07c87d0..6d104dc 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -618,11 +618,6 @@ def run(_config, padding_token = len(characters) + 5 # Mapping characters to integers. char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) - - # Mapping integers back to original characters. - ##num_to_char = StringLookup( - ##vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True - ##) n_classes = len(char_to_num.get_vocabulary()) + 2 if continue_training: @@ -649,21 +644,23 @@ def run(_config, char_to_num=char_to_num, padding_token=padding_token ) - train_ds = tf.data.Dataset.from_generator(gen) - train_ds = train_ds.padded_batch(n_batch, - padded_shapes=([input_height, input_width, 3], [None]), - padding_values=(0, padding_token), - drop_remainder=True, - #num_parallel_calls=tf.data.AUTOTUNE, + train_ds = (tf.data.Dataset.from_generator(gen, (tf.float32, tf.int64)) + .padded_batch(n_batch, + padded_shapes=([input_height, input_width, 3], [None]), + padding_values=(None, tf.constant(padding_token, dtype=tf.int64)), + drop_remainder=True, + #num_parallel_calls=tf.data.AUTOTUNE, + ) + .map(lambda x, y: {"image": x, "label": y}) + .prefetch(tf.data.AUTOTUNE) ) - train_ds = train_ds.prefetch(tf.data.AUTOTUNE) #initial_learning_rate = 1e-4 #decay_steps = int (n_epochs * ( len_dataset / n_batch )) #alpha = 0.01 #lr_schedule = 1e-4 #tf.keras.optimizers.schedules.CosineDecay(initial_learning_rate, decay_steps, 
alpha) - opt = tf.keras.optimizers.Adam(learning_rate=learning_rate) + opt = Adam(learning_rate=learning_rate) model.compile(optimizer=opt) # rs: loss seems to be (ctc_batch_cost) in last layer callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index 818fd14..02a1ca5 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -1073,58 +1073,59 @@ def preprocess_img(img, scaler=sc_ind) def preprocess_img_ocr( - img, - img_name, - lab, - char_to_num=None, - padding_token=-1, - max_len=500, - n_batch=1, - input_height=None, - input_width=None, - augmentation=False, - color_padding_rotation=None, - thetha_padd=None, - padd_colors=None, - rotation_not_90=None, - thetha=None, - padding_white=None, - white_padds=None, - degrading=False, - bin_deg=None, - degrade_scales=None, - blur_aug=False, - blur_k=None, - brightening=False, - brightness=None, - binarization=False, - image_inversion=False, - channels_shuffling=False, - shuffle_indexes=None, - white_noise_strap=False, - textline_skewing=False, - textline_skewing_bin=False, - skewing_amplitudes=None, - textline_left_in_depth=False, - textline_left_in_depth_bin=False, - textline_right_in_depth=False, - textline_right_in_depth_bin=False, - textline_up_in_depth=False, - textline_up_in_depth_bin=False, - textline_down_in_depth=False, - textline_down_in_depth_bin=False, - pepper_aug=False, - pepper_bin_aug=False, - pepper_indexes=None, - dir_img_bin=None, - add_red_textlines=False, - adding_rgb_background=False, - dir_rgb_backgrounds=None, - adding_rgb_foreground=False, - dir_rgb_foregrounds=None, - number_of_backgrounds_per_image=None, - list_all_possible_background_images=None, - list_all_possible_foreground_rgbs=None, + img, + img_name, + lab, + char_to_num=None, + padding_token=-1, + max_len=500, + n_batch=1, + input_height=None, + input_width=None, + augmentation=False, + 
color_padding_rotation=None, + thetha_padd=None, + padd_colors=None, + rotation_not_90=None, + thetha=None, + padding_white=None, + white_padds=None, + degrading=False, + bin_deg=None, + degrade_scales=None, + blur_aug=False, + blur_k=None, + brightening=False, + brightness=None, + binarization=False, + image_inversion=False, + channels_shuffling=False, + shuffle_indexes=None, + white_noise_strap=False, + textline_skewing=False, + textline_skewing_bin=False, + skewing_amplitudes=None, + textline_left_in_depth=False, + textline_left_in_depth_bin=False, + textline_right_in_depth=False, + textline_right_in_depth_bin=False, + textline_up_in_depth=False, + textline_up_in_depth_bin=False, + textline_down_in_depth=False, + textline_down_in_depth_bin=False, + pepper_aug=False, + pepper_bin_aug=False, + pepper_indexes=None, + dir_img_bin=None, + add_red_textlines=False, + adding_rgb_background=False, + dir_rgb_backgrounds=None, + adding_rgb_foreground=False, + dir_rgb_foregrounds=None, + number_of_backgrounds_per_image=None, + list_all_possible_background_images=None, + list_all_possible_foreground_rgbs=None, + **kwargs ): def scale_image(img): return scale_padd_image_for_ocr(img, input_height, input_width).astype(np.float32) / 255. 
From b399db3c00195a7c5df2c9e4b7b20f92a08b8bfa Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 24 Feb 2026 20:43:50 +0100 Subject: [PATCH 099/118] training.models: simplify CTC loss layer --- src/eynollah/training/models.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index ba61764..4652b07 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -29,6 +29,7 @@ from tensorflow.keras.layers import ( ) from tensorflow.keras.models import Model from tensorflow.keras.regularizers import l2 +from tensorflow.keras.backend import ctc_batch_cost from ..patch_encoder import Patches, PatchEncoder @@ -45,10 +46,6 @@ MERGE_AXIS = -1 class CTCLayer(Layer): - def __init__(self, name=None): - super().__init__(name=name) - self.loss_fn = tf.keras.backend.ctc_batch_cost - def call(self, y_true, y_pred): batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64") input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") @@ -56,7 +53,7 @@ class CTCLayer(Layer): input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64") label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64") - loss = self.loss_fn(y_true, y_pred, input_length, label_length) + loss = ctc_batch_cost(y_true, y_pred, input_length, label_length) self.add_loss(loss) # At test time, just return the computed predictions. @@ -505,6 +502,6 @@ def cnn_rnn_ocr_model(image_height=None, image_width=None, n_classes=None, max_s # Add CTC layer for calculating CTC loss at each step. 
output = CTCLayer(name="ctc_loss")(labels, out) - model = Model(inputs=[input_img, labels], outputs=output, name="handwriting_recognizer") + model = Model(inputs=(input_img, labels), outputs=output, name="handwriting_recognizer") return model From 36e370aa45b8b9055c97600363a66b8fedc8b0e2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 25 Feb 2026 00:10:43 +0100 Subject: [PATCH 100/118] training.train: add validation data for OCR --- src/eynollah/training/train.py | 54 +++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 6d104dc..df3eac6 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -10,7 +10,7 @@ import tensorflow as tf from tensorflow.keras.optimizers import SGD, Adam from tensorflow.keras.metrics import MeanIoU, F1Score from tensorflow.keras.models import load_model -from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard +from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping from tensorflow.keras.layers import StringLookup from tensorflow.keras.utils import image_dataset_from_directory from tensorflow.keras.backend import one_hot @@ -606,12 +606,12 @@ def run(_config, elif task=="cnn-rnn-ocr": - dir_img, dir_lab = get_dirs_or_files(dir_train) - dir_img_val, dir_lab_val = get_dirs_or_files(dir_eval) - imgs_list = list(os.listdir(dir_img)) - labs_list = list(os.listdir(dir_lab)) - imgs_list_val = list(os.listdir(dir_img_val)) - labs_list_val = list(os.listdir(dir_lab_val)) + dir_img_train, dir_lab_train = get_dirs_or_files(dir_train) + dir_img_valdn, dir_lab_valdn = get_dirs_or_files(dir_eval) + imgs_list_train = list(os.listdir(dir_img_train)) + labs_list_train = list(os.listdir(dir_lab_train)) + imgs_list_valdn = list(os.listdir(dir_img_valdn)) + labs_list_valdn = list(os.listdir(dir_lab_valdn)) with open(characters_txt_file, 'r') as char_txt_f: characters = 
json.load(char_txt_f) @@ -631,20 +631,20 @@ def run(_config, #print(model.summary()) # todo: use Dataset.map() on Dataset.list_files() - # todo: test_ds - def gen(): - return preprocess_imgs(_config, - imgs_list, - labs_list, - dir_img, - dir_lab, - None, # no file I/O, but in-memory - None, # no file I/O, but in-memory - # extra+overrides - char_to_num=char_to_num, - padding_token=padding_token - ) - train_ds = (tf.data.Dataset.from_generator(gen, (tf.float32, tf.int64)) + def get_dataset(imgs_list, labs_list, dir_img, dir_lab): + def gen(): + return preprocess_imgs(_config, + imgs_list, + labs_list, + dir_img, + dir_lab, + None, # no file I/O, but in-memory + None, # no file I/O, but in-memory + # extra+overrides + char_to_num=char_to_num, + padding_token=padding_token + ) + return (tf.data.Dataset.from_generator(gen, (tf.float32, tf.int64)) .padded_batch(n_batch, padded_shapes=([input_height, input_width, 3], [None]), padding_values=(None, tf.constant(padding_token, dtype=tf.int64)), @@ -653,7 +653,15 @@ def run(_config, ) .map(lambda x, y: {"image": x, "label": y}) .prefetch(tf.data.AUTOTUNE) - ) + ) + train_ds = get_dataset(imgs_list_train, + labs_list_train, + dir_img_train, + dir_lab_train) + valdn_ds = get_dataset(imgs_list_valdn, + labs_list_valdn, + dir_img_valdn, + dir_lab_valdn) #initial_learning_rate = 1e-4 #decay_steps = int (n_epochs * ( len_dataset / n_batch )) @@ -669,7 +677,7 @@ def run(_config, callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) model.fit( train_ds, - #validation_data=test_ds, + validation_data=valdn_ds, verbose=1, epochs=n_epochs, callbacks=callbacks, From 7823ea2c95e1678cad75c9ab6feb8b68b9c928b8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 25 Feb 2026 00:16:07 +0100 Subject: [PATCH 101/118] training.train: add early stopping for OCR --- src/eynollah/training/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 
df3eac6..a3cd1e4 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -672,6 +672,7 @@ def run(_config, model.compile(optimizer=opt) # rs: loss seems to be (ctc_batch_cost) in last layer callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False), + EarlyStopping(verbose=1, patience=3, restore_best_weights=False, start_from_epoch=3), SaveWeightsAfterSteps(0, dir_output, _config)] if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) From 4202a1b2db73fde82ca723a912a4baf28ba540d0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 25 Feb 2026 11:16:21 +0100 Subject: [PATCH 102/118] =?UTF-8?q?training.generate-gt.pagexml2label:=20a?= =?UTF-8?q?dd=20`--missing-printspace`=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - keep default (fallback to full page), but warn - new option `skip` - new option `project` --- .../training/generate_gt_for_training.py | 38 ++++++++++++++----- src/eynollah/training/gt_gen_utils.py | 23 ++++++++++- 2 files changed, 51 insertions(+), 10 deletions(-) diff --git a/src/eynollah/training/generate_gt_for_training.py b/src/eynollah/training/generate_gt_for_training.py index 2422cc2..cc5a1b2 100644 --- a/src/eynollah/training/generate_gt_for_training.py +++ b/src/eynollah/training/generate_gt_for_training.py @@ -35,26 +35,28 @@ def main(): @click.option( "--dir_xml", "-dx", - help="directory of GT page-xml files", + help="input directory of GT PAGE-XML files", type=click.Path(exists=True, file_okay=False), + required=True, ) @click.option( "--dir_images", "-di", - help="directory of org images. If print space cropping or scaling is needed for labels it would be great to provide the original images to apply the same function on them. So if -ps is not set true or in config files no columns_width key is given this argumnet can be ignored. 
File stems in this directory should be the same as those in dir_xml.", + help="input directory of GT image files (only needed for '--printspace' or scaling configured via 'columns_width'; filename stems should match those in --dir_xml)", type=click.Path(exists=True, file_okay=False), ) @click.option( "--dir_out_images", "-doi", - help="directory where the output org images after undergoing a process (like print space cropping or scaling) will be written.", + help="output directory for training image files (for printspace cropping or scaling)", type=click.Path(exists=True, file_okay=False), ) @click.option( "--dir_out", "-do", - help="directory where ground truth label images would be written", + help="output directory for training label files", type=click.Path(exists=True, file_okay=False), + required=True, ) @click.option( @@ -67,16 +69,25 @@ def main(): @click.option( "--type_output", "-to", - help="this defines how output should be. A 2d image array or a 3d image array encoded with RGB color. Just pass 2d or 3d. The file will be saved one directory up. 
2D image array is 3d but only information of one channel would be enough since all channels have the same values.", + type=click.Choice(["2d", "3d"]), + default="2d", + help="generate labels as [H, W] array pseudo index-color images for training ('2d') or [H, W, C] array RGB color images for plotting ('3d')", ) @click.option( "--printspace", "-ps", is_flag=True, - help="if this parameter set to true, generated labels and in the case of provided org images cropping will be imposed and cropped labels and images will be written in output directories.", + help="crop pages from annotated PrintSpace or Border to generate labels and images (will also require -di for so original images so output images are cropped along with labels)", +) +@click.option( + "--missing-printspace", + "-mps", + type=click.Choice(["full", "skip", "project"]), + default="full", + help="if -ps is set, what to do in case a PAGE-XML has no PrintSpace or Border annotation: keep entire page ('full'), ignore file ('skip') or crop artificially from outer hull of all segments ('project')", ) -def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, dir_out_images): +def pagexml2label(dir_xml, dir_out, type_output, config, printspace, missing_printspace, dir_images, dir_out_images): """ extract PAGE-XML GT data suitable for model training for segmentation tasks """ @@ -86,8 +97,17 @@ def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, di else: print("passed") config_params = None - gt_list = get_content_of_dir(dir_xml) - get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params, printspace, dir_images, dir_out_images) + get_images_of_ground_truth(get_content_of_dir(dir_xml), + dir_xml, + dir_out, + type_output, + config, + config_params, + printspace, + missing_printspace, + dir_images, + dir_out_images + ) @main.command() @click.option( diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py 
index d5ad4d9..3f1e515 100644 --- a/src/eynollah/training/gt_gen_utils.py +++ b/src/eynollah/training/gt_gen_utils.py @@ -658,7 +658,18 @@ def get_layout_contours_for_visualization(xml_file): co_noise.append(np.array(c_t_in)) return co_text, co_graphic, co_sep, co_img, co_table, co_map, co_noise, y_len, x_len -def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params, printspace, dir_images, dir_out_images): +def get_images_of_ground_truth( + gt_list, + dir_in, + output_dir, + output_type, + config_file, + config_params, + printspace, + missing_printspace, + dir_images, + dir_out_images +): """ Reading the page xml files and write the ground truth images into given output directory. """ @@ -702,10 +713,20 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_ if printspace or "printspace_as_class_in_layout" in list(config_params.keys()): ps = (root1.xpath('/pc:PcGts/pc:Page/pc:Border', namespaces=NS) + root1.xpath('/pc:PcGts/pc:Page/pc:PrintSpace', namespaces=NS)) + coords = root1.xpath('//pc:Coords/@points', namespaces=NS) if len(ps): points = ps[0].find('pc:Coords', NS).get('points') ps_bbox = bbox_from_points(points) + elif missing_printspace == 'skip': + print(gt_list[index], "has no Border or PrintSpace - skipping file") + continue + elif missing_printspace == 'project' and len(coords): + print(gt_list[index], "has no Border or PrintSpace - projecting hull of segments") + bboxes = list(map(bbox_from_points, coords)) + left, top, right, bottom = zip(*bboxes) + ps_bbox = [min(left), min(top), max(right), max(bottom)] else: + print(gt_list[index], "has no Border or PrintSpace - using full page") ps_bbox = [0, 0, None, None] From 42bab0f93576f45a44e9d8a10a4866afb0d1407c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 25 Feb 2026 13:18:40 +0100 Subject: [PATCH 103/118] docs/train: document `--missing-printspace=project` --- docs/train.md | 5 +++++ src/eynollah/training/gt_gen_utils.py 
| 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/train.md b/docs/train.md index 82bb77c..9c390bb 100644 --- a/docs/train.md +++ b/docs/train.md @@ -271,6 +271,11 @@ eynollah-training generate-gt pagexml2label \ -doi "dir of output cropped images" ``` +Also, note that it can be detrimental to layout training if there are visible segments which +the annotation does not account for (and thus the model must learn to ignore). So if the images +are not cropped, the `-ps` _should_ be used. If a PAGE XML file is missing `PrintSpace` (or `Border`) +annotations, use `-mps` to either `skip` these or `project` (i.e. crop from existing segments). + ## Train a model ### classification diff --git a/src/eynollah/training/gt_gen_utils.py b/src/eynollah/training/gt_gen_utils.py index 3f1e515..796e896 100644 --- a/src/eynollah/training/gt_gen_utils.py +++ b/src/eynollah/training/gt_gen_utils.py @@ -724,7 +724,11 @@ def get_images_of_ground_truth( print(gt_list[index], "has no Border or PrintSpace - projecting hull of segments") bboxes = list(map(bbox_from_points, coords)) left, top, right, bottom = zip(*bboxes) - ps_bbox = [min(left), min(top), max(right), max(bottom)] + left = max(0, min(left) - 5) + top = max(0, min(top) - 5) + right = min(x_len, max(right) + 5) + bottom = min(y_len, max(bottom) + 5) + ps_bbox = [left, top, right, bottom] else: print(gt_list[index], "has no Border or PrintSpace - using full page") ps_bbox = [0, 0, None, None] From b6d2440ce1eca9f8e2b20f030d604ecd63466aeb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 25 Feb 2026 20:39:15 +0100 Subject: [PATCH 104/118] =?UTF-8?q?training.utils.preprocess=5Fimgs:=20fix?= =?UTF-8?q?=20polymorphy=20in=2027f43c1=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (Functions cannot be both generators and procedures, so make this a pure generator and save the image files on the caller's side; also avoids passing output directories) 
Moreover, simplify by moving the `os.listdir` into the function body (saving lots of extra variable bindings). --- src/eynollah/training/train.py | 66 ++++++++++++---------------------- src/eynollah/training/utils.py | 37 ++++++++----------- 2 files changed, 36 insertions(+), 67 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index a3cd1e4..74a7a90 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -2,6 +2,7 @@ import os import sys import json +from tqdm import tqdm import requests os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' @@ -422,31 +423,25 @@ def run(_config, os.mkdir(dir_flow_eval_imgs) os.mkdir(dir_flow_eval_labels) - dir_img, dir_seg = get_dirs_or_files(dir_train) - dir_img_val, dir_seg_val = get_dirs_or_files(dir_eval) - - imgs_list = list(os.listdir(dir_img)) - segs_list = list(os.listdir(dir_seg)) - - imgs_list_test = list(os.listdir(dir_img_val)) - segs_list_test = list(os.listdir(dir_seg_val)) - # writing patches into a sub-folder in order to be flowed from directory. 
- preprocess_imgs(_config, - imgs_list, - segs_list, - dir_img, - dir_seg, - dir_flow_train_imgs, - dir_flow_train_labels) - preprocess_imgs(_config, - imgs_list_test, - segs_list_test, - dir_img_val, - dir_seg_val, - dir_flow_eval_imgs, - dir_flow_eval_labels, - augmentation=False) + def gen(dir_img, dir_lab, dir_flow_imgs, dir_flow_labs, augmentation=True): + indexer = 0 + for img, lab in tqdm(preprocess_imgs(_config, + dir_img, + dir_lab, + augmentation=augmentation), + desc="data_is_provided"): + fname = 'img_%d.png' % indexer + cv2.imwrite(os.path.join(dir_flow_imgs, fname), img) + cv2.imwrite(os.path.join(dir_flow_labs, fname), lab) + indexer += 1 + gen(*get_dirs_or_files(dir_train), + dir_flow_train_imgs, + dir_flow_train_labels) + gen(*get_dirs_or_files(dir_eval), + dir_flow_eval_imgs, + dir_flow_eval_labels, + augmentation=False) if weighted_loss: weights = np.zeros(n_classes) @@ -606,13 +601,6 @@ def run(_config, elif task=="cnn-rnn-ocr": - dir_img_train, dir_lab_train = get_dirs_or_files(dir_train) - dir_img_valdn, dir_lab_valdn = get_dirs_or_files(dir_eval) - imgs_list_train = list(os.listdir(dir_img_train)) - labs_list_train = list(os.listdir(dir_lab_train)) - imgs_list_valdn = list(os.listdir(dir_img_valdn)) - labs_list_valdn = list(os.listdir(dir_lab_valdn)) - with open(characters_txt_file, 'r') as char_txt_f: characters = json.load(char_txt_f) padding_token = len(characters) + 5 @@ -631,15 +619,11 @@ def run(_config, #print(model.summary()) # todo: use Dataset.map() on Dataset.list_files() - def get_dataset(imgs_list, labs_list, dir_img, dir_lab): + def get_dataset(dir_img, dir_lab): def gen(): return preprocess_imgs(_config, - imgs_list, - labs_list, dir_img, dir_lab, - None, # no file I/O, but in-memory - None, # no file I/O, but in-memory # extra+overrides char_to_num=char_to_num, padding_token=padding_token @@ -654,14 +638,8 @@ def run(_config, .map(lambda x, y: {"image": x, "label": y}) .prefetch(tf.data.AUTOTUNE) ) - train_ds = 
get_dataset(imgs_list_train, - labs_list_train, - dir_img_train, - dir_lab_train) - valdn_ds = get_dataset(imgs_list_valdn, - labs_list_valdn, - dir_img_valdn, - dir_lab_valdn) + train_ds = get_dataset(*get_dirs_or_files(dir_train)) + valdn_ds = get_dataset(*get_dirs_or_files(dir_eval)) #initial_learning_rate = 1e-4 #decay_steps = int (n_epochs * ( len_dataset / n_batch )) diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py index 02a1ca5..33a1fd2 100644 --- a/src/eynollah/training/utils.py +++ b/src/eynollah/training/utils.py @@ -9,7 +9,6 @@ import numpy as np import seaborn as sns from scipy.ndimage.interpolation import map_coordinates from scipy.ndimage.filters import gaussian_filter -from tqdm import tqdm import imutils import tensorflow as tf @@ -753,17 +752,11 @@ def get_patches_num_scale_new(img, label, height, width, scaler=1.0): yield img_patch, label_patch -# TODO: refactor to combine with data_gen_ocr def preprocess_imgs(config, - imgs_list, - labs_list, dir_img, dir_lab, - dir_flow_imgs, - dir_flow_lbls, logger=None, - **kwargs, -): + **kwargs): if logger is None: logger = getLogger('') @@ -779,14 +772,16 @@ def preprocess_imgs(config, # override keys from call config.update(kwargs) + imgs_list = list(sorted(os.listdir(dir_img))) + labs_list = list(sorted(os.listdir(dir_lab))) + seed = random.getstate() random.shuffle(imgs_list) random.setstate(seed) random.shuffle(labs_list) # labs_list not used because stem matching more robust - indexer = 0 - for img, lab in tqdm(zip(imgs_list, labs_list)): + for img, lab in zip(imgs_list, labs_list): img_name = os.path.splitext(img)[0] img = cv2.imread(os.path.join(dir_img, img)) if config['task'] in ["segmentation", "binarization"]: @@ -803,20 +798,16 @@ def preprocess_imgs(config, try: if config['task'] == "cnn-rnn-ocr": - yield from preprocess_img_ocr(img, img_name, lab, - **config) + yield from preprocess_img_ocr(img, img_name, lab, **config) continue - for img, lab in preprocess_img(img, 
img_name, lab, - **config): - cv2.imwrite(os.path.join(dir_flow_imgs, '/img_%d.png' % indexer), - resize_image(img, - config['input_height'], - config['input_width'])) - cv2.imwrite(os.path.join(dir_flow_lbls, '/img_%d.png' % indexer), - resize_image(lab, - config['input_height'], - config['input_width'])) - indexer += 1 + else: + for img, lab in preprocess_img(img, img_name, lab, **config): + yield (resize_image(img, + config['input_height'], + config['input_width']), + resize_image(lab, + config['input_height'], + config['input_width'])) except: logger.exception("skipping image %s", img_name) From 439ca350ddf91f20223d4034b4a6c1dea2a4bf2f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 26 Feb 2026 13:55:37 +0100 Subject: [PATCH 105/118] training: add metric ConfusionMatrix and plot it to TensorBoard --- src/eynollah/training/metrics.py | 51 +++++++++++++++++- src/eynollah/training/train.py | 90 +++++++++++++++++++++++++++++--- 2 files changed, 132 insertions(+), 9 deletions(-) diff --git a/src/eynollah/training/metrics.py b/src/eynollah/training/metrics.py index a8f47d7..56dc732 100644 --- a/src/eynollah/training/metrics.py +++ b/src/eynollah/training/metrics.py @@ -1,5 +1,10 @@ -from tensorflow.keras import backend as K +import os + +os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf +from tensorflow.keras import backend as K +from tensorflow.keras.metrics import Metric +from tensorflow.keras.initializers import Zeros import numpy as np @@ -361,3 +366,47 @@ def jaccard_distance_loss(y_true, y_pred, smooth=100): sum_ = K.sum(K.abs(y_true) + K.abs(y_pred), axis=-1) jac = (intersection + smooth) / (sum_ - intersection + smooth) return (1 - jac) * smooth + + +class ConfusionMatrix(Metric): + def __init__(self, nlabels=None, nrm="all", name="confusion_matrix", dtype=tf.float32): + super().__init__(name=name, dtype=dtype) + assert nlabels is not None + self._nlabels = nlabels + self._shape = (self._nlabels, 
self._nlabels) + self._matrix = self.add_weight(name, shape=self._shape, + initializer=Zeros) + assert nrm in ("all", "true", "pred", "none") + self._nrm = nrm + + def update_state(self, y_true, y_pred, sample_weight=None): + y_pred = tf.math.argmax(y_pred, axis=-1) + y_true = tf.math.argmax(y_true, axis=-1) + + y_pred = tf.reshape(y_pred, shape=(-1,)) + y_true = tf.reshape(y_true, shape=(-1,)) + + y_pred.shape.assert_is_compatible_with(y_true.shape) + confusion = tf.math.confusion_matrix(y_true, y_pred, num_classes=self._nlabels, dtype=self._dtype) + + return self._matrix.assign_add(confusion) + + def result(self): + """normalize""" + if self._nrm == "all": + denom = tf.math.reduce_sum(self._matrix, axis=(0, 1)) + elif self._nrm == "true": + denom = tf.math.reduce_sum(self._matrix, axis=1, keepdims=True) + elif self._nrm == "pred": + denom = tf.math.reduce_sum(self._matrix, axis=0, keepdims=True) + else: + denom = tf.constant(1.0) + return tf.math.divide_no_nan(self._matrix, denom) + + def reset_state(self): + for v in self.variables: + v.assign(tf.zeros(shape=self._shape)) + + def get_config(self): + return dict(nlabels=self._nlabels, + **super().get_config()) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 74a7a90..0c624c3 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -1,5 +1,6 @@ import os import sys +import io import json from tqdm import tqdm @@ -21,10 +22,12 @@ from sacred.config import create_captured_function import numpy as np import cv2 +from matplotlib import pyplot as plt # for plot_confusion_matrix from .metrics import ( soft_dice_loss, - weighted_categorical_crossentropy + weighted_categorical_crossentropy, + ConfusionMatrix, ) from .models import ( PatchEncoder, @@ -151,6 +154,45 @@ def plot_layout_tf(in_: tf.Tensor, out:tf.Tensor) -> tf.Tensor: weighted = image * 0.9 + layout * 0.1 return tf.cast(weighted, tf.uint8) +def plot_confusion_matrix(cm, name="Confusion Matrix"): + 
""" + Plot the confusion matrix with matplotlib and tensorflow + """ + size = cm.shape[0] + fig, ax = plt.subplots() + im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) + ax.figure.colorbar(im, ax=ax) + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + xlim=[-0.5, cm.shape[1] - 0.5], + ylim=[-0.5, cm.shape[0] - 0.5], + #xticklabels=labels, + #yticklabels=labels, + title=name, + ylabel='True class', + xlabel='Predicted class') + # Rotate the tick labels and set their alignment. + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", + rotation_mode="anchor") + # Loop over data dimensions and create text annotations. + thresh = cm.max() / 2. + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], ".2f"), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig.tight_layout() + # convert to PNG + buf = io.BytesIO() + fig.savefig(buf, format='png') + plt.close(fig) + buf.seek(0) + # Convert PNG buffer to TF image + image = tf.image.decode_png(buf.getvalue(), channels=4) + # Add the batch dimension + image = tf.expand_dims(image, 0) + return image + # plot predictions on train and test set during every epoch class TensorBoardPlotter(TensorBoard): def __init__(self, *args, **kwargs): @@ -185,6 +227,36 @@ class TensorBoardPlotter(TensorBoard): # used to be family kwarg for tf.summary.image name prefix with tf.name_scope(family): tf.summary.image(mode, images, step=step, max_outputs=len(images)) + def on_train_batch_end(self, batch, logs=None): + if logs is not None: + logs = dict(logs) + # cannot be logged as scalar: + logs.pop('confusion_matrix', None) + super().on_train_batch_end(batch, logs) + def on_test_end(self, logs=None): + if logs is not None: + logs = dict(logs) + # cannot be logged as scalar: + logs.pop('confusion_matrix', None) + super().on_test_end(logs) + def _log_epoch_metrics(self, epoch, logs): + if not logs: + return + logs = dict(logs) + # cannot be 
logged as scalar: + train_matrix = logs.pop('confusion_matrix', None) + val_matrix = logs.pop('val_confusion_matrix', None) + super()._log_epoch_metrics(epoch, logs) + # now plot confusion_matrix + with tf.summary.record_if(True): + if train_matrix is not None: + train_image = plot_confusion_matrix(train_matrix) + with self._train_writer.as_default(): + tf.summary.image("confusion_matrix", train_image, step=epoch) + if val_matrix is not None: + val_image = plot_confusion_matrix(val_matrix) + with self._val_writer.as_default(): + tf.summary.image("confusion_matrix", val_image, step=epoch) def get_dirs_or_files(input_data): image_input, labels_input = os.path.join(input_data, 'images/'), os.path.join(input_data, 'labels/') @@ -523,6 +595,7 @@ def run(_config, #if you want to see the model structure just uncomment model summary. #model.summary() + metrics = ['categorical_accuracy'] if task in ["segmentation", "binarization"]: if is_loss_soft_dice: loss = soft_dice_loss @@ -530,17 +603,18 @@ def run(_config, loss = weighted_categorical_crossentropy(weights) else: loss = 'categorical_crossentropy' + metrics.append(num_connected_components_regression(0.1)) + metrics.append(MeanIoU(n_classes, + name='iou', + ignore_class=0, + sparse_y_true=False, + sparse_y_pred=False)) + metrics.append(ConfusionMatrix(n_classes)) else: # task == "enhancement" loss = 'mean_squared_error' model.compile(loss=loss, optimizer=Adam(learning_rate=learning_rate), - metrics=['accuracy', - num_connected_components_regression(0.1), - MeanIoU(n_classes, - name='iou', - ignore_class=0, - sparse_y_true=False, - sparse_y_pred=False)]) + metrics=metrics) def _to_cv2float(img): # rgb→bgr and uint8→float, as expected by Eynollah models From 7c3aeda65e85db8162ddd733c533904f029bc03d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Feb 2026 12:40:56 +0100 Subject: [PATCH 106/118] training.models: fix 9b66867c --- src/eynollah/training/models.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 4652b07..d0b24c0 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -334,7 +334,7 @@ def vit_resnet50_unet(num_patches, transformer_mlp_head_units = [128, 64] inputs = Input(shape=(input_height, input_width, 3)) - features = resnet50(inputs, weight_decay=weight_decay, pretraining=pretraining) + features = list(resnet50(inputs, weight_decay=weight_decay, pretraining=pretraining)) features[-1] = transformer_block(features[-1], num_patches, From ba954d6314ec1720eb127823f5ac6de45086fa5b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Feb 2026 12:47:59 +0100 Subject: [PATCH 107/118] training.models: fix daa084c3 --- src/eynollah/training/models.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index d0b24c0..13a35a1 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -345,9 +345,7 @@ def vit_resnet50_unet(num_patches, transformer_num_heads, transformer_projection_dim) - o = unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) - - return Model(inputs, o) + return unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) def vit_resnet50_unet_transformer_before_cnn(num_patches, n_classes, @@ -380,9 +378,7 @@ def vit_resnet50_unet_transformer_before_cnn(num_patches, features = resnet50(encoded_patches, weight_decay=weight_decay, pretraining=pretraining) - o = unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) - - return Model(inputs, o) + return unet_decoder(inputs, *features, n_classes, task=task, weight_decay=weight_decay) def resnet50_classifier(n_classes,input_height=224,input_width=224,weight_decay=1e-6,pretraining=False): include_top=True From 2d5de8e5957d5ab6540cad7bd350f6b99ca49cc5 Mon Sep 17 00:00:00 2001 From: 
Robert Sachunsky Date: Fri, 27 Feb 2026 12:48:28 +0100 Subject: [PATCH 108/118] =?UTF-8?q?training.models:=20use=20bilinear=20ins?= =?UTF-8?q?tead=20of=20nearest=20upsampling=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (to benefit from CUDA optimization) --- src/eynollah/training/models.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/eynollah/training/models.py b/src/eynollah/training/models.py index 13a35a1..a95ba7e 100644 --- a/src/eynollah/training/models.py +++ b/src/eynollah/training/models.py @@ -212,7 +212,7 @@ def unet_decoder(img, f1, f2, f3, f4, f5, n_classes, light=False, task="segmenta f4 = BatchNormalization(axis=bn_axis)(f4) f4 = Activation('relu')(f4) - o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING, interpolation="bilinear")(o) o = concatenate([o, f4], axis=MERGE_AXIS) o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) o = Conv2D(512, (3, 3), padding='valid', @@ -220,7 +220,7 @@ def unet_decoder(img, f1, f2, f3, f4, f5, n_classes, light=False, task="segmenta o = BatchNormalization(axis=bn_axis)(o) o = Activation('relu')(o) - o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING, interpolation="bilinear")(o) o = concatenate([o, f3], axis=MERGE_AXIS) o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) o = Conv2D(256, (3, 3), padding='valid', @@ -228,7 +228,7 @@ def unet_decoder(img, f1, f2, f3, f4, f5, n_classes, light=False, task="segmenta o = BatchNormalization(axis=bn_axis)(o) o = Activation('relu')(o) - o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING, interpolation="bilinear")(o) o = concatenate([o, f2], axis=MERGE_AXIS) o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) o = Conv2D(128, (3, 3), padding='valid', @@ -236,7 +236,7 @@ def unet_decoder(img, f1, f2, 
f3, f4, f5, n_classes, light=False, task="segmenta o = BatchNormalization(axis=bn_axis)(o) o = Activation('relu')(o) - o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING, interpolation="bilinear")(o) o = concatenate([o, f1], axis=MERGE_AXIS) o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) o = Conv2D(64, (3, 3), padding='valid', @@ -244,7 +244,7 @@ def unet_decoder(img, f1, f2, f3, f4, f5, n_classes, light=False, task="segmenta o = BatchNormalization(axis=bn_axis)(o) o = Activation('relu')(o) - o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING)(o) + o = UpSampling2D((2, 2), data_format=IMAGE_ORDERING, interpolation="bilinear")(o) o = concatenate([o, img], axis=MERGE_AXIS) o = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING)(o) o = Conv2D(32, (3, 3), padding='valid', From f8dd5a328c130a82d6d06dec70a162937e78a729 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Feb 2026 12:50:37 +0100 Subject: [PATCH 109/118] =?UTF-8?q?training:=20make=20plotting=2018607e0f?= =?UTF-8?q?=20more=20efficient=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - avoid control dependencies in model path - store only every 3rd sample --- src/eynollah/training/train.py | 39 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 0c624c3..30e30cb 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -115,7 +115,6 @@ def num_connected_components_regression(alpha: float): metric.__name__ = 'nCC' return metric -@tf.function def plot_layout_tf(in_: tf.Tensor, out:tf.Tensor) -> tf.Tensor: """ Implements training.inference.SBBPredict.visualize_model_output for TF @@ -158,9 +157,8 @@ def plot_confusion_matrix(cm, name="Confusion Matrix"): """ Plot the confusion matrix with matplotlib and tensorflow """ - size = cm.shape[0] - fig, 
ax = plt.subplots() - im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) + fig, ax = plt.subplots(figsize=(10, 8), dpi=300) + im = ax.imshow(cm, vmin=0.0, vmax=1.0, interpolation='nearest', cmap=plt.cm.Blues) ax.figure.colorbar(im, ax=ax) ax.set(xticks=np.arange(cm.shape[1]), yticks=np.arange(cm.shape[0]), @@ -171,9 +169,6 @@ def plot_confusion_matrix(cm, name="Confusion Matrix"): title=name, ylabel='True class', xlabel='Predicted class') - # Rotate the tick labels and set their alignment. - plt.setp(ax.get_xticklabels(), rotation=45, ha="right", - rotation_mode="anchor") # Loop over data dimensions and create text annotations. thresh = cm.max() / 2. for i in range(cm.shape[0]): @@ -200,33 +195,39 @@ class TensorBoardPlotter(TensorBoard): self.model_call = None def on_epoch_begin(self, epoch, logs=None): super().on_epoch_begin(epoch, logs=logs) + # override the model's call(), so we don't have to invest extra cycles + # to predict our samples (plotting itself can be neglected) self.model_call = self.model.call - @tf.function def new_call(inputs, **kwargs): outputs = self.model_call(inputs, **kwargs) images = plot_layout_tf(inputs, outputs) self.plot(images, training=kwargs.get('training', None), epoch=epoch) - return outputs + with tf.control_dependencies(None): + return outputs self.model.call = new_call - def on_epoch_end(self, epoch, logs=None): - # re-instate (so ModelCheckpoint does not see our override call) - self.model.call = self.model_call # force rebuild of tf.function (so Python binding for epoch gets re-evaluated) self.model.train_function = self.model.make_train_function(True) self.model.test_function = self.model.make_test_function(True) + def on_epoch_end(self, epoch, logs=None): + # re-instate (so ModelCheckpoint does not see our override call) + self.model.call = self.model_call super().on_epoch_end(epoch, logs=logs) def plot(self, images, training=None, epoch=0): if training: writer = self._train_writer - mode, step = "train", 
self._train_step.read_value() + mode, step = "train", self._train_step.value() else: writer = self._val_writer - mode, step = "test", self._val_step.read_value() - family = "epoch_%03d" % (1 + epoch) - with writer.as_default(): - # used to be family kwarg for tf.summary.image name prefix - with tf.name_scope(family): - tf.summary.image(mode, images, step=step, max_outputs=len(images)) + mode, step = "test", self._val_step.value() + # skip most samples, because TF's EncodePNG is so costly, + # and now ends up in the middle of our pipeline, thus causing stalls + # (cannot use max_outputs, as batch size may be too small) + if not tf.cast(step % 3, tf.bool): + with writer.as_default(): + # used to be family kwarg for tf.summary.image name prefix + family = "epoch_%03d/" % (1 + epoch) + name = family + mode + tf.summary.image(name, images, step=step, max_outputs=len(images)) def on_train_batch_end(self, batch, logs=None): if logs is not None: logs = dict(logs) From 1cff937e72154a6440819f031a631ecacda16a39 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Feb 2026 12:53:09 +0100 Subject: [PATCH 110/118] training: make data pipeline in 7888fa5 more efficient --- src/eynollah/training/train.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 30e30cb..92a2f49 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -633,7 +633,7 @@ def run(_config, def get_dataset(dir_imgs, dir_labs, shuffle=None): gen_kwargs = dict(labels=None, label_mode=None, - batch_size=1, # batch after zip below + batch_size=None, # batch after zip below image_size=(input_height, input_width), color_mode='rgb', shuffle=shuffle is not None, @@ -647,11 +647,12 @@ def run(_config, ) img_gen = image_dataset_from_directory(dir_imgs, **gen_kwargs) lab_gen = image_dataset_from_directory(dir_labs, **gen_kwargs) - img_gen = img_gen.map(_to_cv2float) - lab_gen = 
lab_gen.map(_to_cv2float) + img_gen = img_gen.map(_to_cv2float, num_parallel_calls=tf.data.AUTOTUNE) + lab_gen = lab_gen.map(_to_cv2float, num_parallel_calls=tf.data.AUTOTUNE) if task in ["segmentation", "binarization"]: - lab_gen = lab_gen.map(_to_categorical) - return tf.data.Dataset.zip(img_gen, lab_gen).rebatch(n_batch, drop_remainder=True) + lab_gen = lab_gen.map(_to_categorical, num_parallel_calls=tf.data.AUTOTUNE) + ds = tf.data.Dataset.zip(img_gen, lab_gen) + return ds.batch(n_batch, drop_remainder=True, num_parallel_calls=tf.data.AUTOTUNE) train_gen = get_dataset(dir_flow_train_imgs, dir_flow_train_labels, shuffle=np.random.randint(1e6)) valdn_gen = get_dataset(dir_flow_eval_imgs, dir_flow_eval_labels) train_steps = len(os.listdir(dir_flow_train_imgs)) // n_batch From c1d8a72edc3125159396ccca6db45ff8a69c06de Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 28 Feb 2026 20:04:32 +0100 Subject: [PATCH 111/118] training: shuffle tf.data pipelines --- src/eynollah/training/train.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 92a2f49..63f7717 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -660,11 +660,13 @@ def run(_config, _log.info("training on %d batches in %d epochs", train_steps, n_epochs) _log.info("validating on %d batches", valdn_steps) - callbacks = [TensorBoardPlotter(os.path.join(dir_output, 'logs'), write_graph=False), + callbacks = [TensorBoardPlotter(os.path.join(dir_output, 'logs'), profile_batch=(10, 20)), SaveWeightsAfterSteps(0, dir_output, _config), ] if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) + train_gen = train_gen.shuffle(train_steps // 1000, reshuffle_each_iteration=True) + valdn_gen = valdn_gen.shuffle(valdn_steps // 10, reshuffle_each_iteration=False) model.fit( train_gen.prefetch(tf.data.AUTOTUNE), steps_per_epoch=train_steps, @@ -731,7 +733,7 
@@ def run(_config, if save_interval: callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) model.fit( - train_ds, + train_ds.shuffle(200), validation_data=valdn_ds, verbose=1, epochs=n_epochs, From c6d9dd7945e745ed5fde140982d08e1fb7e15c39 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Feb 2026 12:57:47 +0100 Subject: [PATCH 112/118] training: use mixed precision and XLA (commented; does not work, yet) --- src/eynollah/training/metrics.py | 7 ++++--- src/eynollah/training/train.py | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/eynollah/training/metrics.py b/src/eynollah/training/metrics.py index 56dc732..5955888 100644 --- a/src/eynollah/training/metrics.py +++ b/src/eynollah/training/metrics.py @@ -8,7 +8,9 @@ from tensorflow.keras.initializers import Zeros import numpy as np -def focal_loss(gamma=2., alpha=4.): +EPS = K.epsilon() + +def focal_loss(gamma=2., alpha=4., epsilon=EPS): gamma = float(gamma) alpha = float(alpha) @@ -32,7 +34,6 @@ def focal_loss(gamma=2., alpha=4.): Returns: [tensor] -- loss. """ - epsilon = 1.e-9 y_true = tf.convert_to_tensor(y_true, tf.float32) y_pred = tf.convert_to_tensor(y_pred, tf.float32) @@ -153,7 +154,7 @@ def generalized_dice_loss(y_true, y_pred): # TODO: document where this is from -def soft_dice_loss(y_true, y_pred, epsilon=1e-6): +def soft_dice_loss(y_true, y_pred, epsilon=EPS): """ Soft dice loss calculation for arbitrary batch size, number of classes, and number of spatial dimensions. Assumes the `channels_last` format. 
diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 63f7717..4d997e5 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -76,6 +76,8 @@ def configuration(): try: for device in tf.config.list_physical_devices('GPU'): tf.config.experimental.set_memory_growth(device, True) + #tf.keras.mixed_precision.set_global_policy('mixed_float16') + #tf.keras.backend.set_epsilon(1e-4) # avoid NaN from smaller defaults except: print("no GPU device available", file=sys.stderr) @@ -614,6 +616,7 @@ def run(_config, else: # task == "enhancement" loss = 'mean_squared_error' model.compile(loss=loss, + #jit_compile=True, optimizer=Adam(learning_rate=learning_rate), metrics=metrics) From 7e06ab2c8cec8b9db01062f6c2fae954768f553b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Feb 2026 12:55:15 +0100 Subject: [PATCH 113/118] =?UTF-8?q?training:=20add=20config=20param=20add?= =?UTF-8?q?=5Fncc=5Floss=20for=20layout/binarization=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add `metrics.metrics_superposition` and `metrics.Superposition` - if non-zero, mix configured loss with weighted nCC metric --- src/eynollah/training/metrics.py | 30 +++++++++++++++++++++++++++++- src/eynollah/training/train.py | 9 ++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/eynollah/training/metrics.py b/src/eynollah/training/metrics.py index 5955888..60ac421 100644 --- a/src/eynollah/training/metrics.py +++ b/src/eynollah/training/metrics.py @@ -3,7 +3,7 @@ import os os.environ['TF_USE_LEGACY_KERAS'] = '1' # avoid Keras 3 after TF 2.15 import tensorflow as tf from tensorflow.keras import backend as K -from tensorflow.keras.metrics import Metric +from tensorflow.keras.metrics import Metric, MeanMetricWrapper, get from tensorflow.keras.initializers import Zeros import numpy as np @@ -369,6 +369,34 @@ def jaccard_distance_loss(y_true, y_pred, smooth=100): return 
(1 - jac) * smooth +def metrics_superposition(*metrics, weights=None): + """ + return a single metric derived by adding all given metrics + + default weights are uniform + """ + if weights is None: + weights = len(metrics) * [tf.constant(1.0)] + def mixed(y_true, y_pred): + results = [] + for metric, weight in zip(metrics, weights): + results.append(metric(y_true, y_pred) * weight) + return tf.reduce_mean(tf.stack(results), 0) + mixed.__name__ = '/'.join(m.__name__ for m in metrics) + return mixed + + +class Superposition(MeanMetricWrapper): + def __init__(self, metrics, weights=None, dtype=None): + self._metrics = metrics + self._weights = weights + mixed = metrics_superposition(*metrics, weights=weights) + super().__init__(mixed, name=mixed.__name__, dtype=dtype) + def get_config(self): + return dict(metrics=self._metrics, + weights=self._weights, + **super().get_config()) + class ConfusionMatrix(Metric): def __init__(self, nlabels=None, nrm="all", name="confusion_matrix", dtype=tf.float32): super().__init__(name=name, dtype=dtype) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index 4d997e5..a12b9c7 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -27,6 +27,8 @@ from matplotlib import pyplot as plt # for plot_confusion_matrix from .metrics import ( soft_dice_loss, weighted_categorical_crossentropy, + get as get_metric, + metrics_superposition, ConfusionMatrix, ) from .models import ( @@ -306,6 +308,7 @@ def config_params(): if task in ["segmentation", "binarization"]: is_loss_soft_dice = False # Use soft dice as loss function. When set to true, "weighted_loss" must be false. weighted_loss = False # Use weighted categorical cross entropy as loss fucntion. When set to true, "is_loss_soft_dice" must be false. + add_ncc_loss = 0 # Add regression loss for number of connected components. When non-zero, use this as weight for the nCC term. 
elif task == "classification": f1_threshold_classification = None # This threshold is used to consider models with an evaluation f1 scores bigger than it. The selected model weights undergo a weights ensembling. And avreage ensembled model will be written to output. classification_classes_name = None # Dictionary of classification classes names. @@ -416,6 +419,7 @@ def run(_config, thetha=None, is_loss_soft_dice=False, weighted_loss=False, + add_ncc_loss=None, ## if continue_training index_start=0, dir_of_start_model=None, @@ -605,7 +609,10 @@ def run(_config, elif weighted_loss: loss = weighted_categorical_crossentropy(weights) else: - loss = 'categorical_crossentropy' + loss = get_metric('categorical_crossentropy') + if add_ncc_loss: + loss = metrics_superposition(loss, num_connected_components_regression(0.1), + weights=[1 - add_ncc_loss, add_ncc_loss]) metrics.append(num_connected_components_regression(0.1)) metrics.append(MeanIoU(n_classes, name='iou', From 361d40c064d4201a3ecefccab00cf08ee95e1013 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 28 Feb 2026 19:44:10 +0100 Subject: [PATCH 114/118] =?UTF-8?q?training:=20improve=20nCC=20metric/loss?= =?UTF-8?q?=20-=20measure=20localized=20congruence=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - instead of just comparing the number of connected components, calculate the GT/pred label incidence matrix and retrieve the share of singular values (i.e. 
nearly diagonal under reordering) over total counts as similarity score - also, suppress artificial class in that --- src/eynollah/training/train.py | 86 ++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 19 deletions(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index a12b9c7..efaa96e 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -83,7 +83,7 @@ def configuration(): except: print("no GPU device available", file=sys.stderr) -def num_connected_components_regression(alpha: float): +def num_connected_components_regression(artificial=0): """ metric/loss function capturing the separability of segmentation maps @@ -92,29 +92,77 @@ def num_connected_components_regression(alpha: float): 2. the connected components (i.e. the instance label map) 3. the max() (i.e. the highest label = nr of components) - Then calculates a regression formula between those two targets: - - overall mean squared (to incentivise exact fit) - - additive component (to incentivise more over less segments; - this prevents neighbours of spilling into each other; - oversegmentation is usually not as bad as undersegmentation) + The original idea was to then calculate a regression formula + between those two targets. But it is insufficient to just + approximate the same number of components, for they might be + completely different (true components being merged, predicted + components splitting others). We really want to capture the + correspondence between those labels, which is localised. + + For that we now calculate the label pairs and their counts. + Looking at the M,N incidence matrix, we want those counts + to be distributed orthogonally (ideally). So we compute a + singular value decomposition and compare the sum total of + singular values to the sum total of all label counts. The + rate of the two determines a measure of congruence. 
+ + Moreover, for the case of artificial boundary segments around + regions, optionally introduced by the training extractor to + represent segment identity in the loss (and removed at runtime): + Reduce this class to background as well. """ def metric(y_true, y_pred): + if artificial: + # convert artificial border class to background + y_true = y_true[:, :, :, :artificial] + y_pred = y_pred[:, :, :, :artificial] # [B, H, W, C] l_true = tf.math.argmax(y_true, axis=-1) l_pred = tf.math.argmax(y_pred, axis=-1) # [B, H, W] - c_true = connected_components(l_true) - c_pred = connected_components(l_pred) + c_true = tf.cast(connected_components(l_true), tf.int64) + c_pred = tf.cast(connected_components(l_pred), tf.int64) # [B, H, W] - n_batch = tf.shape(y_true)[0] - c_true = tf.reshape(c_true, (n_batch, -1)) - c_pred = tf.reshape(c_pred, (n_batch, -1)) - # [B, H*W] - n_true = tf.math.reduce_max(c_true, axis=1) - n_pred = tf.math.reduce_max(c_pred, axis=1) - # [B] - diff = tf.cast(n_true - n_pred, tf.float32) - return tf.reduce_mean(tf.math.sqrt(tf.math.square(diff) + alpha * diff), axis=-1) + #n_batch = tf.shape(y_true)[0] + n_batch = y_true.shape[0] + C_true = tf.math.reduce_max(c_true, (1, 2)) + 1 + C_pred = tf.math.reduce_max(c_pred, (1, 2)) + 1 + MODULUS = tf.constant(2**22, tf.int64) + tf.debugging.assert_less(C_true, MODULUS, + message="cannot compare segments: too many connected components in GT") + tf.debugging.assert_less(C_pred, MODULUS, + message="cannot compare segments: too many connected components in prediction") + c_comb = MODULUS * c_pred + c_true + tf.debugging.assert_greater_equal(c_comb, tf.constant(0, tf.int64), + message="overflow pairing components") + # [B, H, W] + # tf.unique does not support batch dim, so... 
+ results = [] + for c_comb, C_true, C_pred in zip( + tf.unstack(c_comb, num=n_batch), + tf.unstack(C_true, num=n_batch), + tf.unstack(C_pred, num=n_batch), + ): + prod, _, count = tf.unique_with_counts(tf.reshape(c_comb, (-1,))) + #tf.print(n_batch, tf.shape(prod), C_true, C_true) + # [L] + #corr = tf.zeros([C_pred, C_true], tf.int32) + #corr[prod // 2**24, prod % 2**24] = count + corr = tf.scatter_nd(tf.stack([prod // MODULUS, prod % MODULUS], axis=1), + count, (C_pred, C_true)) + corr = tf.cast(corr, tf.float32) + # [Cpred, Ctrue] + sgv = tf.linalg.svd(corr, compute_uv=False) + results.append(tf.reduce_sum(sgv) / tf.reduce_sum(corr)) + return 1.0 - tf.reduce_mean(tf.stack(results), 0) + # c_true = tf.reshape(c_true, (n_batch, -1)) + # c_pred = tf.reshape(c_pred, (n_batch, -1)) + # # [B, H*W] + # n_true = tf.math.reduce_max(c_true, axis=1) + # n_pred = tf.math.reduce_max(c_pred, axis=1) + # # [B] + # diff = tf.cast(n_true - n_pred, tf.float32) + # return tf.reduce_mean(tf.math.abs(diff) + alpha * diff, axis=-1) metric.__name__ = 'nCC' return metric @@ -611,9 +659,9 @@ def run(_config, else: loss = get_metric('categorical_crossentropy') if add_ncc_loss: - loss = metrics_superposition(loss, num_connected_components_regression(0.1), + loss = metrics_superposition(loss, num_connected_components_regression(n_classes - 1), weights=[1 - add_ncc_loss, add_ncc_loss]) - metrics.append(num_connected_components_regression(0.1)) + metrics.append(num_connected_components_regression(n_classes - 1)) metrics.append(MeanIoU(n_classes, name='iou', ignore_class=0, From e47653f684eb82183a56c671d96ba48e89fb0c29 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 28 Feb 2026 20:01:49 +0100 Subject: [PATCH 115/118] =?UTF-8?q?training:=20move=20nCC=20metric/loss=20?= =?UTF-8?q?to=20.metrics=20and=20rename=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `num_connected_components_regression` → `connected_components_loss` - move 
from training.train to training.metrics --- src/eynollah/training/metrics.py | 85 ++++++++++++++++++++++++++++++ src/eynollah/training/train.py | 90 ++------------------------------ 2 files changed, 88 insertions(+), 87 deletions(-) diff --git a/src/eynollah/training/metrics.py b/src/eynollah/training/metrics.py index 60ac421..caa0e65 100644 --- a/src/eynollah/training/metrics.py +++ b/src/eynollah/training/metrics.py @@ -5,6 +5,7 @@ import tensorflow as tf from tensorflow.keras import backend as K from tensorflow.keras.metrics import Metric, MeanMetricWrapper, get from tensorflow.keras.initializers import Zeros +from tensorflow_addons.image import connected_components import numpy as np @@ -439,3 +440,87 @@ class ConfusionMatrix(Metric): def get_config(self): return dict(nlabels=self._nlabels, **super().get_config()) + +def connected_components_loss(artificial=0): + """ + metric/loss function capturing the separability of segmentation maps + + For both sides (true and predicted, resp.), computes + 1. the argmax() of class-wise softmax input (i.e. the segmentation map) + 2. the connected components (i.e. the instance label map) + 3. the max() (i.e. the highest label = nr of components) + + The original idea was to then calculate a regression formula + between those two targets. But it is insufficient to just + approximate the same number of components, for they might be + completely different (true components being merged, predicted + components splitting others). We really want to capture the + correspondence between those labels, which is localised. + + For that we now calculate the label pairs and their counts. + Looking at the M,N incidence matrix, we want those counts + to be distributed orthogonally (ideally). So we compute a + singular value decomposition and compare the sum total of + singular values to the sum total of all label counts. The + rate of the two determines a measure of congruence. 
+ + Moreover, for the case of artificial boundary segments around + regions, optionally introduced by the training extractor to + represent segment identity in the loss (and removed at runtime): + Reduce this class to background as well. + """ + def metric(y_true, y_pred): + if artificial: + # convert artificial border class to background + y_true = y_true[:, :, :, :artificial] + y_pred = y_pred[:, :, :, :artificial] + # [B, H, W, C] + l_true = tf.math.argmax(y_true, axis=-1) + l_pred = tf.math.argmax(y_pred, axis=-1) + # [B, H, W] + c_true = tf.cast(connected_components(l_true), tf.int64) + c_pred = tf.cast(connected_components(l_pred), tf.int64) + # [B, H, W] + n_batch = y_true.shape[0] + C_true = tf.math.reduce_max(c_true, (1, 2)) + 1 + C_pred = tf.math.reduce_max(c_pred, (1, 2)) + 1 + MODULUS = tf.constant(2**22, tf.int64) + tf.debugging.assert_less(C_true, MODULUS, + message="cannot compare segments: too many connected components in GT") + tf.debugging.assert_less(C_pred, MODULUS, + message="cannot compare segments: too many connected components in prediction") + c_comb = MODULUS * c_pred + c_true + tf.debugging.assert_greater_equal(c_comb, tf.constant(0, tf.int64), + message="overflow pairing components") + # [B, H, W] + # tf.unique does not support batch dim, so... 
+ results = [] + for c_comb, C_true, C_pred in zip( + tf.unstack(c_comb, num=n_batch), + tf.unstack(C_true, num=n_batch), + tf.unstack(C_pred, num=n_batch), + ): + prod, _, count = tf.unique_with_counts(tf.reshape(c_comb, (-1,))) + # [L] + #corr = tf.zeros([C_pred, C_true], tf.int32) + #corr[prod // 2**24, prod % 2**24] = count + corr = tf.scatter_nd(tf.stack([prod // MODULUS, prod % MODULUS], axis=1), + count, (C_pred, C_true)) + corr = tf.cast(corr, tf.float32) + # [Cpred, Ctrue] + sgv = tf.linalg.svd(corr, compute_uv=False) + results.append(tf.reduce_sum(sgv) / tf.reduce_sum(corr)) + return 1.0 - tf.reduce_mean(tf.stack(results), 0) + # c_true = tf.reshape(c_true, (n_batch, -1)) + # c_pred = tf.reshape(c_pred, (n_batch, -1)) + # # [B, H*W] + # n_true = tf.math.reduce_max(c_true, axis=1) + # n_pred = tf.math.reduce_max(c_pred, axis=1) + # # [B] + # diff = tf.cast(n_true - n_pred, tf.float32) + # return tf.reduce_mean(tf.math.abs(diff) + alpha * diff, axis=-1) + + metric.__name__ = 'nCC' + metric._direction = 'down' + return metric + diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index efaa96e..f06c35b 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -16,7 +16,6 @@ from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStoppi from tensorflow.keras.layers import StringLookup from tensorflow.keras.utils import image_dataset_from_directory from tensorflow.keras.backend import one_hot -from tensorflow_addons.image import connected_components from sacred import Experiment from sacred.config import create_captured_function @@ -30,6 +29,7 @@ from .metrics import ( get as get_metric, metrics_superposition, ConfusionMatrix, + connected_components_loss, ) from .models import ( PatchEncoder, @@ -83,90 +83,6 @@ def configuration(): except: print("no GPU device available", file=sys.stderr) -def num_connected_components_regression(artificial=0): - """ - metric/loss function capturing the 
separability of segmentation maps - - For both sides (true and predicted, resp.), computes - 1. the argmax() of class-wise softmax input (i.e. the segmentation map) - 2. the connected components (i.e. the instance label map) - 3. the max() (i.e. the highest label = nr of components) - - The original idea was to then calculate a regression formula - between those two targets. But it is insufficient to just - approximate the same number of components, for they might be - completely different (true components being merged, predicted - components splitting others). We really want to capture the - correspondence between those labels, which is localised. - - For that we now calculate the label pairs and their counts. - Looking at the M,N incidence matrix, we want those counts - to be distributed orthogonally (ideally). So we compute a - singular value decomposition and compare the sum total of - singular values to the sum total of all label counts. The - rate of the two determines a measure of congruence. - - Moreover, for the case of artificial boundary segments around - regions, optionally introduced by the training extractor to - represent segment identity in the loss (and removed at runtime): - Reduce this class to background as well. 
- """ - def metric(y_true, y_pred): - if artificial: - # convert artificial border class to background - y_true = y_true[:, :, :, :artificial] - y_pred = y_pred[:, :, :, :artificial] - # [B, H, W, C] - l_true = tf.math.argmax(y_true, axis=-1) - l_pred = tf.math.argmax(y_pred, axis=-1) - # [B, H, W] - c_true = tf.cast(connected_components(l_true), tf.int64) - c_pred = tf.cast(connected_components(l_pred), tf.int64) - # [B, H, W] - #n_batch = tf.shape(y_true)[0] - n_batch = y_true.shape[0] - C_true = tf.math.reduce_max(c_true, (1, 2)) + 1 - C_pred = tf.math.reduce_max(c_pred, (1, 2)) + 1 - MODULUS = tf.constant(2**22, tf.int64) - tf.debugging.assert_less(C_true, MODULUS, - message="cannot compare segments: too many connected components in GT") - tf.debugging.assert_less(C_pred, MODULUS, - message="cannot compare segments: too many connected components in prediction") - c_comb = MODULUS * c_pred + c_true - tf.debugging.assert_greater_equal(c_comb, tf.constant(0, tf.int64), - message="overflow pairing components") - # [B, H, W] - # tf.unique does not support batch dim, so... 
- results = [] - for c_comb, C_true, C_pred in zip( - tf.unstack(c_comb, num=n_batch), - tf.unstack(C_true, num=n_batch), - tf.unstack(C_pred, num=n_batch), - ): - prod, _, count = tf.unique_with_counts(tf.reshape(c_comb, (-1,))) - #tf.print(n_batch, tf.shape(prod), C_true, C_true) - # [L] - #corr = tf.zeros([C_pred, C_true], tf.int32) - #corr[prod // 2**24, prod % 2**24] = count - corr = tf.scatter_nd(tf.stack([prod // MODULUS, prod % MODULUS], axis=1), - count, (C_pred, C_true)) - corr = tf.cast(corr, tf.float32) - # [Cpred, Ctrue] - sgv = tf.linalg.svd(corr, compute_uv=False) - results.append(tf.reduce_sum(sgv) / tf.reduce_sum(corr)) - return 1.0 - tf.reduce_mean(tf.stack(results), 0) - # c_true = tf.reshape(c_true, (n_batch, -1)) - # c_pred = tf.reshape(c_pred, (n_batch, -1)) - # # [B, H*W] - # n_true = tf.math.reduce_max(c_true, axis=1) - # n_pred = tf.math.reduce_max(c_pred, axis=1) - # # [B] - # diff = tf.cast(n_true - n_pred, tf.float32) - # return tf.reduce_mean(tf.math.abs(diff) + alpha * diff, axis=-1) - - metric.__name__ = 'nCC' - return metric - def plot_layout_tf(in_: tf.Tensor, out:tf.Tensor) -> tf.Tensor: """ Implements training.inference.SBBPredict.visualize_model_output for TF @@ -659,9 +575,9 @@ def run(_config, else: loss = get_metric('categorical_crossentropy') if add_ncc_loss: - loss = metrics_superposition(loss, num_connected_components_regression(n_classes - 1), + loss = metrics_superposition(loss, connected_components_loss(n_classes - 1), weights=[1 - add_ncc_loss, add_ncc_loss]) - metrics.append(num_connected_components_regression(n_classes - 1)) + metrics.append(connected_components_loss(n_classes - 1)) metrics.append(MeanIoU(n_classes, name='iou', ignore_class=0, From 3b56fa2a5b56bead190dda896c11cc8e6666f789 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 28 Feb 2026 20:08:10 +0100 Subject: [PATCH 116/118] training: plot GT/prediction and metrics before training (commented) --- src/eynollah/training/train.py | 27 
+++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index f06c35b..ff6865b 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -641,6 +641,33 @@ def run(_config, callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config)) train_gen = train_gen.shuffle(train_steps // 1000, reshuffle_each_iteration=True) valdn_gen = valdn_gen.shuffle(valdn_steps // 10, reshuffle_each_iteration=False) + # from matplotlib import pyplot as plt + # from tensorflow_addons.image import connected_components + # def plot(x, ytrue): + # ypred = model.call(x) + # gt = plot_layout_tf(x, ytrue) + # dt = plot_layout_tf(x, ypred) + # segtrue = tf.math.argmax(ytrue, axis=-1) + # segpred = tf.math.argmax(ypred, axis=-1) + # cctrue = connected_components(segtrue) + # ccpred = connected_components(segpred) + # cc = connected_components_loss(n_classes-1)(ytrue, ypred) + # sd = soft_dice_loss(ytrue, ypred) + # return gt, dt, cctrue, ccpred, cc, sd + # for gt, dt, gtcc, dtcc, cc, sd in train_gen.take(15).rebatch(1).map(plot).as_numpy_iterator(): + # plt.subplot(2, 2, 1) + # plt.imshow(np.squeeze(gt)) + # plt.title('GT') + # plt.subplot(2, 2, 3) + # plt.imshow(np.squeeze(gtcc)) + # plt.title('GT CC') + # plt.subplot(2, 2, 4) + # plt.imshow(np.squeeze(dtcc)) + # plt.title('prediction CC') + # plt.subplot(2, 2, 2) + # plt.imshow(np.squeeze(dt)) + # plt.title(f'prediction (nCC={cc} soft dice={sd:.3f})') + # plt.show() model.fit( train_gen.prefetch(tf.data.AUTOTUNE), steps_per_epoch=train_steps, From 686f1d34aa6037c2f604d100a454f96ddf565a96 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Mar 2026 04:37:20 +0100 Subject: [PATCH 117/118] do_prediction*: simplify (esp. 
indexing/slicing) --- src/eynollah/eynollah.py | 432 +++++++++++---------------------------- 1 file changed, 116 insertions(+), 316 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index d089511..5cad8a0 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -724,25 +724,21 @@ class Eynollah: if thresholding_for_some_classes_in_light_version: seg_not_base = label_p_pred[:,:,:,4] - seg_not_base[seg_not_base>0.03] =1 - seg_not_base[seg_not_base<1] =0 + seg_not_base = (seg_not_base > 0.03).astype(int) seg_line = label_p_pred[:,:,:,3] - seg_line[seg_line>0.1] =1 - seg_line[seg_line<1] =0 + seg_line = (seg_line > 0.1).astype(int) seg_background = label_p_pred[:,:,:,0] - seg_background[seg_background>0.25] =1 - seg_background[seg_background<1] =0 + seg_background = (seg_background > 0.25).astype(int) seg[seg_not_base==1]=4 seg[seg_background==1]=0 seg[(seg_line==1) & (seg==0)]=3 + if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[:,:,:,2] - - seg_art[seg_art0] =1 + seg_art = (seg_art >= threshold_art_class_textline).astype(int) ##seg[seg_art==1]=2 @@ -759,113 +755,51 @@ class Eynollah: index_x_u_in = list_x_u[indexer_inside_batch] index_x_d_in = list_x_d[indexer_inside_batch] - if i_batch == 0 and j_batch == 0: - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + 0:index_x_u_in - margin] = \ - seg_in[0:-margin or None, - 0:-margin or None, - np.newaxis] - if thresholding_for_artificial_class_in_light_version: - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + 0:index_x_u_in - margin, 1] = \ - seg_in_art[0:-margin or None, - 0:-margin or None] - - elif i_batch == nxf - 1 and j_batch == nyf - 1: - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + margin:index_x_u_in - 0] = \ - seg_in[margin:, - margin:, - np.newaxis] - if thresholding_for_artificial_class_in_light_version: - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - 
index_x_d_in + margin:index_x_u_in - 0, 1] = \ - seg_in_art[margin:, - margin:] - - elif i_batch == 0 and j_batch == nyf - 1: - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + 0:index_x_u_in - margin] = \ - seg_in[margin:, - 0:-margin or None, - np.newaxis] - if thresholding_for_artificial_class_in_light_version: - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + 0:index_x_u_in - margin, 1] = \ - seg_in_art[margin:, - 0:-margin or None] - - elif i_batch == nxf - 1 and j_batch == 0: - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - 0] = \ - seg_in[0:-margin or None, - margin:, - np.newaxis] - if thresholding_for_artificial_class_in_light_version: - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - 0, 1] = \ - seg_in_art[0:-margin or None, - margin:] - - elif i_batch == 0 and j_batch != 0 and j_batch != nyf - 1: - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + 0:index_x_u_in - margin] = \ - seg_in[margin:-margin or None, - 0:-margin or None, - np.newaxis] - if thresholding_for_artificial_class_in_light_version: - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + 0:index_x_u_in - margin, 1] = \ - seg_in_art[margin:-margin or None, - 0:-margin or None] - - elif i_batch == nxf - 1 and j_batch != 0 and j_batch != nyf - 1: - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - 0] = \ - seg_in[margin:-margin or None, - margin:, - np.newaxis] - if thresholding_for_artificial_class_in_light_version: - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - 0, 1] = \ - seg_in_art[margin:-margin or None, - margin:] - - elif i_batch != 0 and i_batch != nxf - 1 and j_batch == 0: - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - margin] = \ - 
seg_in[0:-margin or None, - margin:-margin or None, - np.newaxis] - if thresholding_for_artificial_class_in_light_version: - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - margin, 1] = \ - seg_in_art[0:-margin or None, - margin:-margin or None] - - elif i_batch != 0 and i_batch != nxf - 1 and j_batch == nyf - 1: - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + margin:index_x_u_in - margin] = \ - seg_in[margin:, - margin:-margin or None, - np.newaxis] - if thresholding_for_artificial_class_in_light_version: - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + margin:index_x_u_in - margin, 1] = \ - seg_in_art[margin:, - margin:-margin or None] - + where = np.index_exp[index_y_d_in:index_y_u_in, + index_x_d_in:index_x_u_in] + if (i_batch == 0 and + j_batch == 0): + inbox = np.index_exp[0:-margin or None, + 0:-margin or None] + elif (i_batch == nxf - 1 and + j_batch == nyf - 1): + inbox = np.index_exp[margin:, + margin:] + elif (i_batch == 0 and + j_batch == nyf - 1): + inbox = np.index_exp[margin:, + 0:-margin or None] + elif (i_batch == nxf - 1 and + j_batch == 0): + inbox = np.index_exp[0:-margin or None, + margin:] + elif (i_batch == 0 and + j_batch != 0 and + j_batch != nyf - 1): + inbox = np.index_exp[margin:-margin or None, + 0:-margin or None] + elif (i_batch == nxf - 1 and + j_batch != 0 and + j_batch != nyf - 1): + inbox = np.index_exp[margin:-margin or None, + margin:] + elif (i_batch != 0 and + i_batch != nxf - 1 and + j_batch == 0): + inbox = np.index_exp[0:-margin or None, + margin:-margin or None] + elif (i_batch != 0 and + i_batch != nxf - 1 and + j_batch == nyf - 1): + inbox = np.index_exp[margin:, + margin:-margin or None] else: - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - margin] = \ - seg_in[margin:-margin or None, - margin:-margin or None, - np.newaxis] - if 
thresholding_for_artificial_class_in_light_version: - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - margin, 1] = \ - seg_in_art[margin:-margin or None, - margin:-margin or None] + inbox = np.index_exp[margin:-margin or None, + margin:-margin or None] + prediction_true[where][inbox] = seg_in[inbox + (np.newaxis,)] + if thresholding_for_artificial_class_in_light_version: + prediction_true[where][inbox + (1,)] = seg_in_art[inbox] + indexer_inside_batch += 1 @@ -885,11 +819,7 @@ class Eynollah: kernel_min = np.ones((3, 3), np.uint8) prediction_true[:,:,0][prediction_true[:,:,0]==2] = 0 - skeleton_art = skeletonize(prediction_true[:,:,1]) - skeleton_art = skeleton_art*1 - - skeleton_art = skeleton_art.astype('uint8') - + skeleton_art = skeletonize(prediction_true[:,:,1]).astype(np.uint8) skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) prediction_true[:,:,0][skeleton_art==1]=2 @@ -924,18 +854,13 @@ class Eynollah: if thresholding_for_artificial_class_in_light_version: kernel_min = np.ones((3, 3), np.uint8) seg_art = label_p_pred[0,:,:,4] - seg_art[seg_art0] =1 + seg_art = (seg_art >= threshold_art_class_layout).astype(int) #seg[seg_art==1]=4 seg_art = resize_image(seg_art, img_h_page, img_w_page).astype(np.uint8) prediction_true[:,:,0][prediction_true[:,:,0]==4] = 0 - skeleton_art = skeletonize(seg_art) - skeleton_art = skeleton_art*1 - - skeleton_art = skeleton_art.astype('uint8') - + skeleton_art = skeletonize(seg_art).astype(np.uint8) skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) prediction_true[:,:,0][skeleton_art==1] = 4 @@ -948,6 +873,8 @@ class Eynollah: img = resize_image(img, img.shape[0], img_width_model) self.logger.debug("Patch size: %sx%s", img_height_model, img_width_model) + thresholding = (thresholding_for_artificial_class_in_light_version or + thresholding_for_some_classes_in_light_version) margin = int(marginal_of_patch_percent * img_height_model) width_mid = 
img_width_model - 2 * margin height_mid = img_height_model - 2 * margin @@ -974,18 +901,10 @@ class Eynollah: img_patch = np.zeros((n_batch_inference, img_height_model, img_width_model, 3)) for i in range(nxf): for j in range(nyf): - if i == 0: - index_x_d = i * width_mid - index_x_u = index_x_d + img_width_model - else: - index_x_d = i * width_mid - index_x_u = index_x_d + img_width_model - if j == 0: - index_y_d = j * height_mid - index_y_u = index_y_d + img_height_model - else: - index_y_d = j * height_mid - index_y_u = index_y_d + img_height_model + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model if index_x_u > img_w: index_x_u = img_w index_x_d = img_w - img_width_model @@ -1000,7 +919,8 @@ class Eynollah: list_y_d.append(index_y_d) list_y_u.append(index_y_u) - img_patch[batch_indexer] = img[index_y_d:index_y_u, index_x_d:index_x_u] + img_patch[batch_indexer] = img[index_y_d:index_y_u, + index_x_d:index_x_u] batch_indexer += 1 if (batch_indexer == n_batch_inference or @@ -1012,29 +932,25 @@ class Eynollah: if thresholding_for_some_classes_in_light_version: seg_art = label_p_pred[:,:,:,4] - seg_art[seg_art0] =1 + + seg_art = (seg_art >= threshold_art_class_layout).astype(int) seg_line = label_p_pred[:,:,:,3] - seg_line[seg_line>0.4] =1#seg_line[seg_line>0.5] =1#seg_line[seg_line>0.1] =1 - seg_line[seg_line<1] =0 + seg_line = (seg_line > 0.4).astype(int) ##seg[seg_art==1]=4 #seg[(seg_line==1) & (seg==0)]=3 if thresholding_for_artificial_class_in_light_version: seg_art = label_p_pred[:,:,:,2] - seg_art[seg_art0] =1 - + seg_art = (seg_art >= threshold_art_class_textline).astype(int) ##seg[seg_art==1]=2 indexer_inside_batch = 0 for i_batch, j_batch in zip(list_i_s, list_j_s): seg_in = seg[indexer_inside_batch] - if (thresholding_for_artificial_class_in_light_version or - thresholding_for_some_classes_in_light_version): + if thresholding: seg_in_art = 
seg_art[indexer_inside_batch] index_y_u_in = list_y_u[indexer_inside_batch] @@ -1043,164 +959,56 @@ class Eynollah: index_x_u_in = list_x_u[indexer_inside_batch] index_x_d_in = list_x_d[indexer_inside_batch] - if i_batch == 0 and j_batch == 0: - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + 0:index_x_u_in - margin] = \ - seg_in[0:-margin or None, - 0:-margin or None, - np.newaxis] - confidence_matrix[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + 0:index_x_u_in - margin] = \ - label_p_pred[0, 0:-margin or None, - 0:-margin or None, - 1] - if (thresholding_for_artificial_class_in_light_version or - thresholding_for_some_classes_in_light_version): - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + 0:index_x_u_in - margin, 1] = \ - seg_in_art[0:-margin or None, - 0:-margin or None] - - elif i_batch == nxf - 1 and j_batch == nyf - 1: - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + margin:index_x_u_in - 0] = \ - seg_in[margin:, - margin:, - np.newaxis] - confidence_matrix[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + margin:index_x_u_in - 0] = \ - label_p_pred[0, margin:, - margin:, - 1] - if (thresholding_for_artificial_class_in_light_version or - thresholding_for_some_classes_in_light_version): - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + margin:index_x_u_in - 0, 1] = \ - seg_in_art[margin:, - margin:] - - elif i_batch == 0 and j_batch == nyf - 1: - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + 0:index_x_u_in - margin] = \ - seg_in[margin:, - 0:-margin or None, - np.newaxis] - confidence_matrix[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + 0:index_x_u_in - margin] = \ - label_p_pred[0, margin:, - 0:-margin or None, - 1] - - if (thresholding_for_artificial_class_in_light_version or - thresholding_for_some_classes_in_light_version): - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + 
0:index_x_u_in - margin, 1] = \ - seg_in_art[margin:, - 0:-margin or None] - - elif i_batch == nxf - 1 and j_batch == 0: - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - 0] = \ - seg_in[0:-margin or None, - margin:, - np.newaxis] - confidence_matrix[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - 0] = \ - label_p_pred[0, 0:-margin or None, - margin:, - 1] - if (thresholding_for_artificial_class_in_light_version or - thresholding_for_some_classes_in_light_version): - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - 0, 1] = \ - seg_in_art[0:-margin or None, - margin:] - - elif i_batch == 0 and j_batch != 0 and j_batch != nyf - 1: - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + 0:index_x_u_in - margin] = \ - seg_in[margin:-margin or None, - 0:-margin or None, - np.newaxis] - confidence_matrix[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + 0:index_x_u_in - margin] = \ - label_p_pred[0, margin:-margin or None, - 0:-margin or None, - 1] - if (thresholding_for_artificial_class_in_light_version or - thresholding_for_some_classes_in_light_version): - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + 0:index_x_u_in - margin, 1] = \ - seg_in_art[margin:-margin or None, - 0:-margin or None] - elif i_batch == nxf - 1 and j_batch != 0 and j_batch != nyf - 1: - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - 0] = \ - seg_in[margin:-margin or None, - margin:, - np.newaxis] - confidence_matrix[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - 0] = \ - label_p_pred[0, margin:-margin or None, - margin:, - 1] - if (thresholding_for_artificial_class_in_light_version or - thresholding_for_some_classes_in_light_version): - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + 
margin:index_x_u_in - 0, 1] = \ - seg_in_art[margin:-margin or None, - margin:] - elif i_batch != 0 and i_batch != nxf - 1 and j_batch == 0: - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - margin] = \ - seg_in[0:-margin or None, - margin:-margin or None, - np.newaxis] - confidence_matrix[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - margin] = \ - label_p_pred[0, 0:-margin or None, - margin:-margin or None, - 1] - if (thresholding_for_artificial_class_in_light_version or - thresholding_for_some_classes_in_light_version): - prediction_true[index_y_d_in + 0:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - margin, 1] = \ - seg_in_art[0:-margin or None, - margin:-margin or None] - elif i_batch != 0 and i_batch != nxf - 1 and j_batch == nyf - 1: - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + margin:index_x_u_in - margin] = \ - seg_in[margin:, - margin:-margin or None, - np.newaxis] - confidence_matrix[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + margin:index_x_u_in - margin] = \ - label_p_pred[0, margin:, - margin:-margin or None, - 1] - if (thresholding_for_artificial_class_in_light_version or - thresholding_for_some_classes_in_light_version): - prediction_true[index_y_d_in + margin:index_y_u_in - 0, - index_x_d_in + margin:index_x_u_in - margin, 1] = \ - seg_in_art[margin:, - margin:-margin or None] + where = np.index_exp[index_y_d_in:index_y_u_in, + index_x_d_in:index_x_u_in] + if (i_batch == 0 and + j_batch == 0): + inbox = np.index_exp[0:-margin or None, + 0:-margin or None] + elif (i_batch == nxf - 1 and + j_batch == nyf - 1): + inbox = np.index_exp[margin:, + margin:] + elif (i_batch == 0 and + j_batch == nyf - 1): + inbox = np.index_exp[margin:, + 0:-margin or None] + elif (i_batch == nxf - 1 and + j_batch == 0): + inbox = np.index_exp[0:-margin or None, + margin:] + elif (i_batch == 0 and + j_batch != 0 and + j_batch != nyf - 1): 
+ inbox = np.index_exp[margin:-margin or None, + 0:-margin or None] + elif (i_batch == nxf - 1 and + j_batch != 0 and + j_batch != nyf - 1): + inbox = np.index_exp[margin:-margin or None, + margin:] + elif (i_batch != 0 and + i_batch != nxf - 1 and + j_batch == 0): + inbox = np.index_exp[0:-margin or None, + margin:-margin or None] + elif (i_batch != 0 and + i_batch != nxf - 1 and + j_batch == nyf - 1): + inbox = np.index_exp[margin:, + margin:-margin or None] else: - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - margin] = \ - seg_in[margin:-margin or None, - margin:-margin or None, - np.newaxis] - confidence_matrix[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - margin] = \ - label_p_pred[0, margin:-margin or None, - margin:-margin or None, - 1] - if (thresholding_for_artificial_class_in_light_version or - thresholding_for_some_classes_in_light_version): - prediction_true[index_y_d_in + margin:index_y_u_in - margin, - index_x_d_in + margin:index_x_u_in - margin, 1] = \ - seg_in_art[margin:-margin or None, - margin:-margin or None] + inbox = np.index_exp[margin:-margin or None, + margin:-margin or None] + prediction_true[where][inbox] = seg_in[inbox + (np.newaxis,)] + confidence_matrix[where][inbox] = label_p_pred[(0,) + inbox + (1,)] + # rs: why is prediction_true 3ch when only 1st gets used? + # artificial boundary class map should be extra array + # rs: why does confidence_matrix only get text-label scores? 
+ # should be scores at final argmax + if thresholding: + prediction_true[where][inbox + (1,)] = seg_in_art[inbox] + indexer_inside_batch += 1 list_i_s = [] @@ -1219,11 +1027,7 @@ class Eynollah: kernel_min = np.ones((3, 3), np.uint8) prediction_true[:,:,0][prediction_true[:,:,0]==2] = 0 - skeleton_art = skeletonize(prediction_true[:,:,1]) - skeleton_art = skeleton_art*1 - - skeleton_art = skeleton_art.astype('uint8') - + skeleton_art = skeletonize(prediction_true[:,:,1]).astype(np.uint8) skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) prediction_true[:,:,0][skeleton_art==1]=2 @@ -1232,11 +1036,7 @@ class Eynollah: kernel_min = np.ones((3, 3), np.uint8) prediction_true[:,:,0][prediction_true[:,:,0]==4] = 0 - skeleton_art = skeletonize(prediction_true[:,:,1]) - skeleton_art = skeleton_art*1 - - skeleton_art = skeleton_art.astype('uint8') - + skeleton_art = skeletonize(prediction_true[:,:,1]).astype(np.uint8) skeleton_art = cv2.dilate(skeleton_art, kernel_min, iterations=1) prediction_true[:,:,0][skeleton_art==1]=4 From b9cf68b51acb5202c8ecbe222ee4a73a63040291 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Mar 2026 20:00:05 +0100 Subject: [PATCH 118/118] training: fix b6d2440c --- src/eynollah/training/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py index ff6865b..4542297 100644 --- a/src/eynollah/training/train.py +++ b/src/eynollah/training/train.py @@ -491,7 +491,7 @@ def run(_config, if data_is_provided: dirs = dir_flow_train_labels else: - dirs = dir_seg + dirs = os.path.join(dir_train, "labels") for obj in os.listdir(dirs): label_file = os.path.join(dirs, + obj) try: