do_order_of_regions: improve box matching, simplify

- when searching for the box matching a contour, be more precise:
  - avoid heuristic rules ("xmin + 80 within xrange") in favour
    of exact criteria (contour properly contained in the box)
  - for the fallback criterion (nearest centers), also require
    proper containment of the contour's center in the box
- `order_of_regions`: remove the workaround for missing indexes
  (which occurred if boxes were not covering contours exactly);
  it is now unnecessary, and was insufficient anyway
This commit is contained in:
Robert Sachunsky 2025-10-02 22:35:40 +02:00
parent 4950e6bd78
commit 7387f5a929
2 changed files with 106 additions and 93 deletions

View file

@ -2518,51 +2518,59 @@ class Eynollah:
contours_only_text_parent = np.array(contours_only_text_parent) contours_only_text_parent = np.array(contours_only_text_parent)
contours_only_text_parent_h = np.array(contours_only_text_parent_h) contours_only_text_parent_h = np.array(contours_only_text_parent_h)
boxes = np.array(boxes, dtype=int) # to be on the safe side boxes = np.array(boxes, dtype=int) # to be on the safe side
cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contours( c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1),
0.5 * boxes[:, 0:2].sum(axis=1)))
cx_main, cy_main, mx_main, Mx_main, my_main, My_main, mxy_main = find_new_features_of_contours(
contours_only_text_parent) contours_only_text_parent)
cx_text_only_h, cy_text_only_h, x_min_text_only_h, _, _, _, y_cor_x_min_main_h = find_new_features_of_contours( cx_head, cy_head, mx_head, Mx_head, my_head, My_head, mxy_head = find_new_features_of_contours(
contours_only_text_parent_h) contours_only_text_parent_h)
try: try:
arg_text_con = [] arg_text_con = []
for ii in range(len(cx_text_only)): for ii in range(len(contours_only_text_parent)):
check_if_textregion_located_in_a_box = False check_if_textregion_located_in_a_box = False
for jj in range(len(boxes)): for jj, box in enumerate(boxes):
if (x_min_text_only[ii] + 80 >= boxes[jj][0] and if (mx_main[ii] >= box[0] and
x_min_text_only[ii] + 80 < boxes[jj][1] and Mx_main[ii] < box[1] and
y_cor_x_min_main[ii] >= boxes[jj][2] and my_main[ii] >= box[2] and
y_cor_x_min_main[ii] < boxes[jj][3]): My_main[ii] < box[3]):
arg_text_con.append(jj) arg_text_con.append(jj)
check_if_textregion_located_in_a_box = True check_if_textregion_located_in_a_box = True
break break
if not check_if_textregion_located_in_a_box: if not check_if_textregion_located_in_a_box:
dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 +
(cy_text_only[ii] - boxes[jj][2]) ** 2) # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2)
for jj in range(len(boxes))] # for box in boxes]
ind_min = np.argmin(dists_tr_from_box) dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0)
pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) &
(boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1]))
ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
arg_text_con.append(ind_min) arg_text_con.append(ind_min)
args_contours = np.array(range(len(arg_text_con))) args_contours = np.arange(len(arg_text_con))
order_by_con_main = np.zeros(len(arg_text_con))
arg_text_con_h = [] arg_text_con_h = []
for ii in range(len(cx_text_only_h)): for ii in range(len(contours_only_text_parent_h)):
check_if_textregion_located_in_a_box = False check_if_textregion_located_in_a_box = False
for jj in range(len(boxes)): for jj, box in enumerate(boxes):
if (x_min_text_only_h[ii] + 80 >= boxes[jj][0] and if (mx_head[ii] >= box[0] and
x_min_text_only_h[ii] + 80 < boxes[jj][1] and Mx_head[ii] < box[1] and
y_cor_x_min_main_h[ii] >= boxes[jj][2] and my_head[ii] >= box[2] and
y_cor_x_min_main_h[ii] < boxes[jj][3]): My_head[ii] < box[3]):
arg_text_con_h.append(jj) arg_text_con_h.append(jj)
check_if_textregion_located_in_a_box = True check_if_textregion_located_in_a_box = True
break break
if not check_if_textregion_located_in_a_box: if not check_if_textregion_located_in_a_box:
dists_tr_from_box = [math.sqrt((cx_text_only_h[ii] - boxes[jj][1]) ** 2 + # dists_tr_from_box = [math.sqrt((cx_head[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 +
(cy_text_only_h[ii] - boxes[jj][2]) ** 2) # (cy_head[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2)
for jj in range(len(boxes))] # for box in boxes]
ind_min = np.argmin(dists_tr_from_box) dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0)
pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) &
(boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1]))
ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
arg_text_con_h.append(ind_min) arg_text_con_h.append(ind_min)
args_contours_h = np.array(range(len(arg_text_con_h))) args_contours_h = np.arange(len(arg_text_con_h))
order_by_con_head = np.zeros(len(arg_text_con_h)) order_by_con_head = np.zeros(len(arg_text_con_h))
order_by_con_main = np.zeros(len(arg_text_con))
ref_point = 0 ref_point = 0
order_of_texts_tot = [] order_of_texts_tot = []
@ -2590,12 +2598,12 @@ class Eynollah:
for zahler, _ in enumerate(args_contours_box): for zahler, _ in enumerate(args_contours_box):
arg_order_v = indexes_sorted_main[zahler] arg_order_v = indexes_sorted_main[zahler]
order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \
np.where(indexes_sorted == arg_order_v)[0][0] + ref_point np.flatnonzero(indexes_sorted == arg_order_v) + ref_point
for zahler, _ in enumerate(args_contours_box_h): for zahler, _ in enumerate(args_contours_box_h):
arg_order_v = indexes_sorted_head[zahler] arg_order_v = indexes_sorted_head[zahler]
order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \
np.where(indexes_sorted == arg_order_v)[0][0] + ref_point np.flatnonzero(indexes_sorted == arg_order_v) + ref_point
for jji in range(len(id_of_texts)): for jji in range(len(id_of_texts)):
order_of_texts_tot.append(order_of_texts[jji] + ref_point) order_of_texts_tot.append(order_of_texts[jji] + ref_point)
@ -2611,53 +2619,59 @@ class Eynollah:
order_text_new = [] order_text_new = []
for iii in range(len(order_of_texts_tot)): for iii in range(len(order_of_texts_tot)):
order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii))
except Exception as why: except Exception as why:
self.logger.error(why) self.logger.error(why)
arg_text_con = [] arg_text_con = []
for ii in range(len(cx_text_only)): for ii in range(len(contours_only_text_parent)):
check_if_textregion_located_in_a_box = False check_if_textregion_located_in_a_box = False
for jj in range(len(boxes)): for jj, box in enumerate(boxes):
if (cx_text_only[ii] >= boxes[jj][0] and if (cx_main[ii] >= box[0] and
cx_text_only[ii] < boxes[jj][1] and cx_main[ii] < box[1] and
cy_text_only[ii] >= boxes[jj][2] and cy_main[ii] >= box[2] and
cy_text_only[ii] < boxes[jj][3]): cy_main[ii] < box[3]):
# this is valid if the center of region identify in which box it is located # this is valid if the center of region identify in which box it is located
arg_text_con.append(jj) arg_text_con.append(jj)
check_if_textregion_located_in_a_box = True check_if_textregion_located_in_a_box = True
break break
if not check_if_textregion_located_in_a_box: if not check_if_textregion_located_in_a_box:
dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 +
(cy_text_only[ii] - boxes[jj][2]) ** 2) # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2)
for jj in range(len(boxes))] # for box in boxes]
ind_min = np.argmin(dists_tr_from_box) dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0)
pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) &
(boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1]))
ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
arg_text_con.append(ind_min) arg_text_con.append(ind_min)
args_contours = np.array(range(len(arg_text_con))) args_contours = np.arange(len(arg_text_con))
order_by_con_main = np.zeros(len(arg_text_con)) order_by_con_main = np.zeros(len(arg_text_con))
############################# head ############################# head
arg_text_con_h = [] arg_text_con_h = []
for ii in range(len(cx_text_only_h)): for ii in range(len(contours_only_text_parent_h)):
check_if_textregion_located_in_a_box = False check_if_textregion_located_in_a_box = False
for jj in range(len(boxes)): for jj, box in enumerate(boxes):
if (cx_text_only_h[ii] >= boxes[jj][0] and if (cx_head[ii] >= box[0] and
cx_text_only_h[ii] < boxes[jj][1] and cx_head[ii] < box[1] and
cy_text_only_h[ii] >= boxes[jj][2] and cy_head[ii] >= box[2] and
cy_text_only_h[ii] < boxes[jj][3]): cy_head[ii] < box[3]):
# this is valid if the center of region identify in which box it is located # this is valid if the center of region identify in which box it is located
arg_text_con_h.append(jj) arg_text_con_h.append(jj)
check_if_textregion_located_in_a_box = True check_if_textregion_located_in_a_box = True
break break
if not check_if_textregion_located_in_a_box: if not check_if_textregion_located_in_a_box:
dists_tr_from_box = [math.sqrt((cx_text_only_h[ii] - boxes[jj][1]) ** 2 + # dists_tr_from_box = [math.sqrt((cx_head[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 +
(cy_text_only_h[ii] - boxes[jj][2]) ** 2) # (cy_head[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2)
for jj in range(len(boxes))] # for box in boxes]
ind_min = np.argmin(dists_tr_from_box) dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0)
pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) &
(boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1]))
ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
arg_text_con_h.append(ind_min) arg_text_con_h.append(ind_min)
args_contours_h = np.array(range(len(arg_text_con_h))) args_contours_h = np.arange(len(arg_text_con_h))
order_by_con_head = np.zeros(len(arg_text_con_h)) order_by_con_head = np.zeros(len(arg_text_con_h))
ref_point = 0 ref_point = 0
@ -2686,14 +2700,14 @@ class Eynollah:
for zahler, _ in enumerate(args_contours_box): for zahler, _ in enumerate(args_contours_box):
arg_order_v = indexes_sorted_main[zahler] arg_order_v = indexes_sorted_main[zahler]
order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \
np.where(indexes_sorted == arg_order_v)[0][0] + ref_point np.flatnonzero(indexes_sorted == arg_order_v) + ref_point
for zahler, _ in enumerate(args_contours_box_h): for zahler, _ in enumerate(args_contours_box_h):
arg_order_v = indexes_sorted_head[zahler] arg_order_v = indexes_sorted_head[zahler]
order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \ order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = \
np.where(indexes_sorted == arg_order_v)[0][0] + ref_point np.flatnonzero(indexes_sorted == arg_order_v) + ref_point
for jji, _ in enumerate(id_of_texts): for jji in range(len(id_of_texts)):
order_of_texts_tot.append(order_of_texts[jji] + ref_point) order_of_texts_tot.append(order_of_texts[jji] + ref_point)
id_of_texts_tot.append(id_of_texts[jji]) id_of_texts_tot.append(id_of_texts[jji])
ref_point += len(id_of_texts) ref_point += len(id_of_texts)
@ -2707,7 +2721,7 @@ class Eynollah:
order_text_new = [] order_text_new = []
for iii in range(len(order_of_texts_tot)): for iii in range(len(order_of_texts_tot)):
order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii))
self.logger.debug("exit do_order_of_regions_full_layout") self.logger.debug("exit do_order_of_regions_full_layout")
return order_text_new, id_of_texts_tot return order_text_new, id_of_texts_tot
@ -2719,28 +2733,33 @@ class Eynollah:
contours_only_text_parent = np.array(contours_only_text_parent) contours_only_text_parent = np.array(contours_only_text_parent)
contours_only_text_parent_h = np.array(contours_only_text_parent_h) contours_only_text_parent_h = np.array(contours_only_text_parent_h)
boxes = np.array(boxes, dtype=int) # to be on the safe side boxes = np.array(boxes, dtype=int) # to be on the safe side
cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contours( c_boxes = np.stack((0.5 * boxes[:, 2:4].sum(axis=1),
0.5 * boxes[:, 0:2].sum(axis=1)))
cx_main, cy_main, mx_main, Mx_main, my_main, My_main, mxy_main = find_new_features_of_contours(
contours_only_text_parent) contours_only_text_parent)
try: try:
arg_text_con = [] arg_text_con = []
for ii in range(len(cx_text_only)): for ii in range(len(contours_only_text_parent)):
check_if_textregion_located_in_a_box = False check_if_textregion_located_in_a_box = False
for jj in range(len(boxes)): for jj, box in enumerate(boxes):
if (x_min_text_only[ii] + 80 >= boxes[jj][0] and if (mx_main[ii] >= box[0] and
x_min_text_only[ii] + 80 < boxes[jj][1] and Mx_main[ii] < box[1] and
y_cor_x_min_main[ii] >= boxes[jj][2] and my_main[ii] >= box[2] and
y_cor_x_min_main[ii] < boxes[jj][3]): My_main[ii] < box[3]):
arg_text_con.append(jj) arg_text_con.append(jj)
check_if_textregion_located_in_a_box = True check_if_textregion_located_in_a_box = True
break break
if not check_if_textregion_located_in_a_box: if not check_if_textregion_located_in_a_box:
dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + # dists_tr_from_box = [math.sqrt((cx_main[ii] - 0.5 * box[1] - 0.5 * box[0]) ** 2 +
(cy_text_only[ii] - boxes[jj][2]) ** 2) # (cy_main[ii] - 0.5 * box[3] - 0.5 * box[2]) ** 2)
for jj in range(len(boxes))] # for box in boxes]
ind_min = np.argmin(dists_tr_from_box) dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0)
pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) &
(boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1]))
ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
arg_text_con.append(ind_min) arg_text_con.append(ind_min)
args_contours = np.array(range(len(arg_text_con))) args_contours = np.arange(len(arg_text_con))
order_by_con_main = np.zeros(len(arg_text_con)) order_by_con_main = np.zeros(len(arg_text_con))
ref_point = 0 ref_point = 0
@ -2766,7 +2785,7 @@ class Eynollah:
for zahler, _ in enumerate(args_contours_box): for zahler, _ in enumerate(args_contours_box):
arg_order_v = indexes_sorted_main[zahler] arg_order_v = indexes_sorted_main[zahler]
order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \
np.where(indexes_sorted == arg_order_v)[0][0] + ref_point np.flatnonzero(indexes_sorted == arg_order_v) + ref_point
for jji, _ in enumerate(id_of_texts): for jji, _ in enumerate(id_of_texts):
order_of_texts_tot.append(order_of_texts[jji] + ref_point) order_of_texts_tot.append(order_of_texts[jji] + ref_point)
@ -2779,29 +2798,29 @@ class Eynollah:
order_text_new = [] order_text_new = []
for iii in range(len(order_of_texts_tot)): for iii in range(len(order_of_texts_tot)):
order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii))
except Exception as why: except Exception as why:
self.logger.error(why) self.logger.error(why)
arg_text_con = [] arg_text_con = []
for ii in range(len(cx_text_only)): for ii in range(len(contours_only_text_parent)):
check_if_textregion_located_in_a_box = False check_if_textregion_located_in_a_box = False
for jj in range(len(boxes)): for jj, box in enumerate(boxes):
if (cx_text_only[ii] >= boxes[jj][0] and if (cx_main[ii] >= box[0] and
cx_text_only[ii] < boxes[jj][1] and cx_main[ii] < box[1] and
cy_text_only[ii] >= boxes[jj][2] and cy_main[ii] >= box[2] and
cy_text_only[ii] < boxes[jj][3]): cy_main[ii] < box[3]):
# this is valid if the center of region identify in which box it is located # this is valid if the center of region identify in which box it is located
arg_text_con.append(jj) arg_text_con.append(jj)
check_if_textregion_located_in_a_box = True check_if_textregion_located_in_a_box = True
break break
if not check_if_textregion_located_in_a_box: if not check_if_textregion_located_in_a_box:
dists_tr_from_box = [math.sqrt((cx_text_only[ii] - boxes[jj][1]) ** 2 + dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0)
(cy_text_only[ii] - boxes[jj][2]) ** 2) pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) &
for jj in range(len(boxes))] (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1]))
ind_min = np.argmin(dists_tr_from_box) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
arg_text_con.append(ind_min) arg_text_con[ii] = ind_min
args_contours = np.array(range(len(arg_text_con))) args_contours = np.arange(len(contours_only_text_parent))
order_by_con_main = np.zeros(len(arg_text_con)) order_by_con_main = np.zeros(len(arg_text_con))
ref_point = 0 ref_point = 0
@ -2829,7 +2848,7 @@ class Eynollah:
for zahler, _ in enumerate(args_contours_box): for zahler, _ in enumerate(args_contours_box):
arg_order_v = indexes_sorted_main[zahler] arg_order_v = indexes_sorted_main[zahler]
order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \ order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = \
np.where(indexes_sorted == arg_order_v)[0][0] + ref_point np.flatnonzero(indexes_sorted == arg_order_v) + ref_point
for jji, _ in enumerate(id_of_texts): for jji, _ in enumerate(id_of_texts):
order_of_texts_tot.append(order_of_texts[jji] + ref_point) order_of_texts_tot.append(order_of_texts[jji] + ref_point)
@ -2843,7 +2862,7 @@ class Eynollah:
order_text_new = [] order_text_new = []
for iii in range(len(order_of_texts_tot)): for iii in range(len(order_of_texts_tot)):
order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) order_text_new.append(np.flatnonzero(np.array(order_of_texts_tot) == iii))
self.logger.debug("exit do_order_of_regions_no_full_layout") self.logger.debug("exit do_order_of_regions_no_full_layout")
return order_text_new, id_of_texts_tot return order_text_new, id_of_texts_tot

View file

@ -1222,6 +1222,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref):
# offset from bbox of mask # offset from bbox of mask
peaks_neg_new += y_ref peaks_neg_new += y_ref
# assert not len(cy_main) or np.min(peaks_neg_new) <= np.min(cy_main) and np.max(cy_main) <= np.max(peaks_neg_new)
# assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new)
matrix_of_orders = np.zeros((len(contours_main) + len(contours_head), 5), dtype=int) matrix_of_orders = np.zeros((len(contours_main) + len(contours_head), 5), dtype=int)
matrix_of_orders[:, 0] = np.arange(len(contours_main) + len(contours_head)) matrix_of_orders[:, 0] = np.arange(len(contours_main) + len(contours_head))
@ -1251,16 +1253,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref):
##matrix_of_orders[:len_main,4]=final_indexers_sorted[:] ##matrix_of_orders[:len_main,4]=final_indexers_sorted[:]
# This fix is applied if the sum of the lengths of contours and contours_h # assert len(final_indexers_sorted) == len(contours_main) + len(contours_head)
# does not match final_indexers_sorted. However, this is not the optimal solution.. # assert not len(final_indexers_sorted) or max(final_index_type) == max(len(contours_main)
if len(cy_main) + len(cy_header) == len(final_index_type):
pass
else:
indexes_missed = set(np.arange(len(cy_main) + len(cy_header))) - set(final_indexers_sorted)
for ind_missed in indexes_missed:
final_indexers_sorted.append(ind_missed)
final_types.append(1)
final_index_type.append(ind_missed)
return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type) return np.array(final_indexers_sorted), np.array(final_types), np.array(final_index_type)