From e9bb62bd86747dabd5cd6fb5f67a36547c5c626d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Oct 2025 23:44:00 +0200 Subject: [PATCH] do_order_of_regions: simplify - avoid loops in favour of array processing --- src/eynollah/eynollah.py | 158 ++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 94 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 3194b66..6a3fd1e 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2526,7 +2526,7 @@ class Eynollah: contours_only_text_parent_h) try: - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2534,7 +2534,7 @@ class Eynollah: Mx_main[ii] < box[1] and my_main[ii] >= box[2] and My_main[ii] < box[3]): - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2545,11 +2545,11 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con.append(ind_min) - args_contours = np.arange(len(arg_text_con)) - order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) - arg_text_con_h = [] + arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2557,7 +2557,7 @@ class Eynollah: Mx_head[ii] < box[1] and my_head[ii] >= box[2] and My_head[ii] < box[3]): - arg_text_con_h.append(jj) + arg_text_con_head[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2568,9 +2568,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_h.append(ind_min) - args_contours_h = np.arange(len(arg_text_con_h)) - order_by_con_head = np.zeros(len(arg_text_con_h)) + arg_text_con_head[ii] = ind_min + args_contours_head = np.arange(len(contours_only_text_parent_h)) + order_by_con_head = np.zeros_like(arg_text_con_head) ref_point = 0 order_of_texts_tot = [] @@ -2578,10 +2578,10 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) - args_contours_box = args_contours[np.array(arg_text_con) == iij] - args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = contours_only_text_parent[args_contours_box] - con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] + args_contours_box_main = args_contours_main[arg_text_con_main == iij] + args_contours_box_head = args_contours_head[arg_text_con_head == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] + con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) @@ -2595,14 +2595,14 @@ class Eynollah: indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - for zahler, _ in enumerate(args_contours_box_h): + for zahler, _ in enumerate(args_contours_box_head): arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ + order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji in range(len(id_of_texts)): @@ -2610,20 +2610,13 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - for tj1 in range(len(contours_only_text_parent_h)): - order_of_texts_tot.append(int(order_by_con_head[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = np.concatenate((order_by_con_main, + order_by_con_head)) + order_text_new = np.argsort(order_of_texts_tot) except Exception as why: self.logger.error(why) - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2632,10 +2625,9 @@ class Eynollah: cy_main[ii] >= box[2] and cy_main[ii] < box[3]): # this is valid if the center of region identify in which box it is located - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break - if not check_if_textregion_located_in_a_box: # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 + # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2) @@ -2644,13 +2636,11 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con.append(ind_min) - args_contours = np.arange(len(arg_text_con)) - order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) - ############################# head - - arg_text_con_h = [] + arg_text_con_head = np.zeros(len(contours_only_text_parent_h), dtype=int) for ii in range(len(contours_only_text_parent_h)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2659,7 +2649,7 @@ class Eynollah: cy_head[ii] >= box[2] and cy_head[ii] < box[3]): # this is valid if the center of region identify in which box it is located - arg_text_con_h.append(jj) + arg_text_con_head[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2670,9 +2660,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con_h.append(ind_min) - args_contours_h = np.arange(len(arg_text_con_h)) - order_by_con_head = np.zeros(len(arg_text_con_h)) + arg_text_con_head[ii] = ind_min + args_contours_head = np.arange(len(contours_only_text_parent_h)) + order_by_con_head = np.zeros_like(arg_text_con_head) ref_point = 0 order_of_texts_tot = [] @@ -2680,10 +2670,10 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) - args_contours_box = args_contours[np.array(arg_text_con) == iij] - args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = contours_only_text_parent[args_contours_box] - con_inter_box_h = contours_only_text_parent_h[args_contours_box_h] + args_contours_box_main = args_contours_main[arg_text_con_main == iij] + args_contours_box_head = args_contours_head[arg_text_con_head == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] + con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) @@ -2697,14 +2687,14 @@ class Eynollah: indexes_sorted_head = indexes_sorted[kind_of_texts_sorted == 2] indexes_by_type_head = index_by_kind_sorted[kind_of_texts_sorted == 2] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point - for zahler, _ in enumerate(args_contours_box_h): + for zahler, _ in enumerate(args_contours_box_head): arg_order_v = indexes_sorted_head[zahler] - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ + order_by_con_head[args_contours_box_head[indexes_by_type_head[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji in range(len(id_of_texts)): @@ -2712,16 +2702,9 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - for tj1 in range(len(contours_only_text_parent_h)): - order_of_texts_tot.append(int(order_by_con_head[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = np.concatenate((order_by_con_main, + order_by_con_head)) + order_text_new = np.argsort(order_of_texts_tot) self.logger.debug("exit do_order_of_regions_full_layout") return order_text_new, id_of_texts_tot @@ -2739,7 +2722,7 @@ class Eynollah: contours_only_text_parent) try: - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2747,7 +2730,7 @@ class Eynollah: Mx_main[ii] < box[1] and my_main[ii] >= box[2] and My_main[ii] < box[3]): - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2758,9 +2741,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con.append(ind_min) - args_contours = np.arange(len(arg_text_con)) - order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) ref_point = 0 order_of_texts_tot = [] @@ -2768,8 +2751,8 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) - args_contours_box = args_contours[np.array(arg_text_con) == iij] - con_inter_box = contours_only_text_parent[args_contours_box] + args_contours_box_main = args_contours_main[arg_text_con_main == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] con_inter_box_h = [] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( @@ -2782,9 +2765,9 @@ class Eynollah: indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji, _ in enumerate(id_of_texts): @@ -2792,17 +2775,12 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = order_by_con_main + order_text_new = np.argsort(order_of_texts_tot) except Exception as why: self.logger.error(why) - arg_text_con = [] + arg_text_con_main = np.zeros(len(contours_only_text_parent), dtype=int) for ii in range(len(contours_only_text_parent)): check_if_textregion_located_in_a_box = False for jj, box in enumerate(boxes): @@ -2811,7 +2789,7 @@ class Eynollah: cy_main[ii] >= box[2] and cy_main[ii] < box[3]): # this is valid if the center of region identify in which box it is located - arg_text_con.append(jj) + arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True break if not check_if_textregion_located_in_a_box: @@ -2819,9 +2797,9 @@ class Eynollah: pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) & (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) - arg_text_con[ii] = ind_min - args_contours = np.arange(len(contours_only_text_parent)) - order_by_con_main = np.zeros(len(arg_text_con)) + arg_text_con_main[ii] = ind_min + args_contours_main = np.arange(len(contours_only_text_parent)) + order_by_con_main = np.zeros_like(arg_text_con_main) ref_point = 0 order_of_texts_tot = [] @@ -2829,11 +2807,9 @@ class Eynollah: for iij, box in enumerate(boxes): ys = slice(*box[2:4]) xs = slice(*box[0:2]) - args_contours_box = args_contours[np.array(arg_text_con) == iij] - con_inter_box = [] + args_contours_box_main = args_contours_main[arg_text_con_main == iij] + con_inter_box = contours_only_text_parent[args_contours_box_main] con_inter_box_h = [] - for i in range(len(args_contours_box)): - con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) @@ -2845,9 +2821,9 @@ class Eynollah: indexes_sorted_main = indexes_sorted[kind_of_texts_sorted == 1] indexes_by_type_main = index_by_kind_sorted[kind_of_texts_sorted == 1] - for zahler, _ in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box_main): arg_order_v = indexes_sorted_main[zahler] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ + order_by_con_main[args_contours_box_main[indexes_by_type_main[zahler]]] = \ np.flatnonzero(indexes_sorted == arg_order_v) + ref_point for jji, _ in enumerate(id_of_texts): @@ -2855,14 +2831,8 @@ class Eynollah: id_of_texts_tot.append(id_of_texts[jji]) ref_point += len(id_of_texts) - order_of_texts_tot = [] - - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii)) + order_of_texts_tot = order_by_con_main + order_text_new = np.argsort(order_of_texts_tot) self.logger.debug("exit do_order_of_regions_no_full_layout") return order_text_new, id_of_texts_tot