Mirror of https://github.com/qurator-spk/eynollah.git (synced 2025-10-27 07:44:12 +01:00)
return_x_start_end_mothers_childs_and_type_of_reading_order: simplify and document

- simplify
  - rename identifiers to make readable:
    - `y_sep` → `y_mid` (because the cy gets passed)
    - `y_diff` → `y_max` (because the ymax gets passed)
  - array instead of list operations
- add docstring and in-line comments
- return (zero-length) numpy array instead of empty list
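The last point keeps the return types uniform for callers: a zero-length numpy array can be filtered, indexed and concatenated exactly like a non-empty one, whereas an empty Python list needs special-casing. A minimal sketch (the values are invented; only the variable name is taken from the diff):

    import numpy as np

    y_mid_without_mother = np.zeros(0, int)                       # empty, but still an integer array
    filtered = y_mid_without_mother[y_mid_without_mother > 100]   # works, stays empty
    combined = np.append(np.array([5, 9]), y_mid_without_mother)  # works, gives array([5, 9])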
This commit is contained in:
parent 0fc4b2535d
commit e2dfec75fb

2 changed files with 198 additions and 190 deletions
@@ -2507,6 +2507,7 @@ class Eynollah:
                             My_main[ii] < box[3])):
                         arg_text_con_main[ii] = jj
                         check_if_textregion_located_in_a_box = True
+                        #print("main/matched", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", box, only_centers)
                         break
                 if not check_if_textregion_located_in_a_box:
                     dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0)
@@ -2514,6 +2515,7 @@ class Eynollah:
                                                    (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1]))
                     ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
                     arg_text_con_main[ii] = ind_min
+                    #print("main/fallback", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", boxes[ind_min], only_centers)
            args_contours_main = np.arange(len(contours_only_text_parent))
            order_by_con_main = np.zeros_like(arg_text_con_main)

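The fallback branch shown here relies on a masked array so that `argmin` only considers boxes whose range actually contains the region center. A small sketch of that idiom (toy values, not the surrounding variables):

    import numpy as np

    dists = np.array([12.0, 3.0, 7.0])          # distance of a region center to each box center
    contained = np.array([False, False, True])  # whether the center lies inside each box

    # Masking the non-containing boxes makes argmin ignore them entirely:
    ind_min = np.argmin(np.ma.masked_array(dists, ~contained))
    assert ind_min == 2   # box 1 is closer, but only box 2 contains the center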
@@ -2531,6 +2533,7 @@ class Eynollah:
                             My_head[ii] < box[3])):
                         arg_text_con_head[ii] = jj
                         check_if_textregion_located_in_a_box = True
+                        #print("head/matched", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", box, only_centers)
                         break
                 if not check_if_textregion_located_in_a_box:
                     dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0)
@@ -2538,6 +2541,7 @@ class Eynollah:
                                                    (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1]))
                     ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
                     arg_text_con_head[ii] = ind_min
+                    #print("head/fallback", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", boxes[ind_min], only_centers)
            args_contours_head = np.arange(len(contours_only_text_parent_h))
            order_by_con_head = np.zeros_like(arg_text_con_head)

@@ -2587,7 +2591,7 @@ class Eynollah:
        try:
            results = match_boxes(False)
        except Exception as why:
-            self.logger.error(why)
+            self.logger.exception(why)
            results = match_boxes(True)

        self.logger.debug("exit do_order_of_regions")
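For context on the `error` → `exception` change (it recurs in the hunks below): `Logger.exception` logs at ERROR level just like `Logger.error`, but additionally records the active traceback, which is why it is the better fit inside `except` blocks. A minimal sketch:

    import logging

    logger = logging.getLogger("eynollah")

    try:
        result = 1 / 0
    except Exception as why:
        logger.error(why)      # logs only "division by zero"
        logger.exception(why)  # logs the message plus the full traceback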
@@ -2976,7 +2980,7 @@ class Eynollah:
                              max(self.num_col_lower or num_col_classifier,
                                  num_col_classifier))
        except Exception as why:
-            self.logger.error(why)
+            self.logger.exception(why)
            num_col = None
        #print("inside graphics 3 ", time.time() - t_in_gr)
        return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines,
@@ -3044,7 +3048,7 @@ class Eynollah:
            if not num_column_is_classified:
                num_col_classifier = num_col + 1
        except Exception as why:
-            self.logger.error(why)
+            self.logger.exception(why)
            num_col = None
        return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines,
                text_regions_p_1, cont_page, table_prediction)
@@ -33,226 +33,229 @@ def pairwise(iterable):
        a = b

 def return_x_start_end_mothers_childs_and_type_of_reading_order(
-        x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, cy_hor_diff):
+        x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, y_max_hor_some):
+    """
+    Analyse which separators overlap multiple column candidates,
+    and how they overlap each other.
+
+    Ignore separators not spanning multiple columns.
+
+    For the separators to be returned, try to join them when they are directly
+    adjacent horizontally but nearby vertically (and thus mutually compatible).
+    Also, mark any separators that already span the full width.
+
+    Furthermore, identify which pairs of (unjoined) separators span subsets of columns
+    of each other (disregarding vertical positions). Referring, respectively, to the
+    superset separators as "mothers" and to the subset separators as "children",
+    retrieve information on which columns are spanned by separators with no mother,
+    and which columns are spanned by their children (if any).
+
+    Moreover, determine if there is any (column) overlap among the multi-span separators
+    with no mother, specifically (and thus, no simple box separation is possible).
+
+    Arguments:
+    * the x start column index of the raw separators
+    * the x end column index of the raw separators
+    * the y center coordinate of the raw separators
+    * the x column coordinates
+    * the y end coordinate of the raw separators
+
+    Returns:
+    a tuple of:
+    * whether any top-level (no-mother) multi-span separators overlap each other
+    * the x start column index of the resulting multi-span separators
+    * the x end column index of the resulting multi-span separators
+    * the y center coordinate of the resulting multi-span separators
+    * the y end coordinate of the resulting multi-span separators
+    * the y center (for 1 representative) of the top-level (no-mother) multi-span separators
+    * the x start column index of the top-level (no-mother) multi-span separators
+    * the x end column index of the top-level (no-mother) multi-span separators
+    * whether any multi-span separators have super-spans of other (child) multi-span separators
+    * the y center (for 1 representative) of the top-level (no-mother) multi-span separators
+      which have super-spans of other (child) multi-span separators
+    * the x start column index of the top-level multi-span separators
+      which have super-spans of other (child) multi-span separators
+    * the x end column index of the top-level multi-span separators
+      which have super-spans of other (child) multi-span separators
+    * indexes of multi-span separators with full-width span
+    """
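To make the "mother"/"child" terminology concrete: column spans are compared as sets of column slots, exactly like the `nodes_i`/`nodes_j` sets built further down in this diff. A toy illustration (numbers invented):

    mother_span = set(range(0, 3))   # separator covering column slots 0..2
    child_span  = set(range(1, 3))   # separator covering column slots 1..2 only

    assert child_span < mother_span   # proper subset: this separator "has a mother"
    assert mother_span > child_span   # proper superset: this separator "has a child"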
     x_start=[]
     x_end=[]
-    kind=[]#if covers 2 and more than 2 columns set it to 1 otherwise 0
     len_sep=[]
-    y_sep=[]
-    y_diff=[]
+    y_mid=[]
+    y_max=[]
     new_main_sep_y=[]

     indexer=0
     for i in range(len(x_min_hor_some)):
+        #print(indexer, "%d:%d" % (x_min_hor_some[i], x_max_hor_some[i]), cy_hor_some[i])
         starting = x_min_hor_some[i] - peak_points
-        starting=starting[starting>=0]
-        min_start=np.argmin(starting)
-        ending=peak_points-x_max_hor_some[i]
-        len_ending_neg=len(ending[ending<=0])
-        ending=ending[ending>0]
-        max_end=np.argmin(ending)+len_ending_neg
+        min_start = np.flatnonzero(starting >= 0)[-1] # last left-of
+        ending = x_max_hor_some[i] - peak_points
+        max_end = np.flatnonzero(ending < 0)[0] # first right-of
+        #print(indexer, "%d:%d" % (min_start, max_end))

         if (max_end-min_start)>=2:
+            # column range of separator spans more than one column candidate
             if (max_end-min_start)==(len(peak_points)-1):
+                # all columns (i.e. could be true new y splitter)
                 new_main_sep_y.append(indexer)

             #print((max_end-min_start),len(peak_points),'(max_end-min_start)')
-            y_sep.append(cy_hor_some[i])
-            y_diff.append(cy_hor_diff[i])
+            y_mid.append(cy_hor_some[i])
+            y_max.append(y_max_hor_some[i])
             x_end.append(max_end)

             x_start.append(min_start)

             len_sep.append(max_end-min_start)
-            if max_end==min_start+1:
-                kind.append(0)
-            else:
-                kind.append(1)

         indexer+=1
+    #print(x_start,'x_start')
+    #print(x_end,'x_end')

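The rewritten index computation above can be read off a small example: given the column x-coordinates `peak_points`, the start slot is the last peak at or left of the separator's x_min, and the end slot is the first peak strictly right of its x_max (toy numbers):

    import numpy as np

    peak_points = np.array([0, 500, 1000, 1500, 2000])   # column boundary x coordinates
    x_min, x_max = 480, 1530                             # one raw separator

    min_start = np.flatnonzero(x_min - peak_points >= 0)[-1]   # -> 0 (last peak left of x_min)
    max_end   = np.flatnonzero(x_max - peak_points < 0)[0]     # -> 4 (first peak right of x_max)
    # here max_end - min_start == len(peak_points) - 1, i.e. a full-width separator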
     x_start_returned = np.array(x_start, dtype=int)
     x_end_returned = np.array(x_end, dtype=int)
-    y_sep_returned = np.array(y_sep, dtype=int)
-    y_diff_returned = np.array(y_diff, dtype=int)
+    y_mid_returned = np.array(y_mid, dtype=int)
+    y_max_returned = np.array(y_max, dtype=int)
+    #print(y_mid_returned,'y_mid_returned')
-    all_args_uniq = contours_in_same_horizon(y_sep_returned)
-    args_to_be_unified=[]
-    y_unified=[]
-    y_diff_unified=[]
-    x_s_unified=[]
-    x_e_unified=[]
-    if len(all_args_uniq)>0:
-        #print('burda')
-        if type(all_args_uniq[0]) is list:
-            for dd in range(len(all_args_uniq)):
-                if len(all_args_uniq[dd])==2:
-                    x_s_same_hor=np.array(x_start_returned)[all_args_uniq[dd]]
-                    x_e_same_hor=np.array(x_end_returned)[all_args_uniq[dd]]
-                    y_sep_same_hor=np.array(y_sep_returned)[all_args_uniq[dd]]
-                    y_diff_same_hor=np.array(y_diff_returned)[all_args_uniq[dd]]
-                    #print('burda2')
-                    if (x_s_same_hor[0]==x_e_same_hor[1]-1 or
-                        x_s_same_hor[1]==x_e_same_hor[0]-1 and
-                        x_s_same_hor[0]!=x_s_same_hor[1] and
-                        x_e_same_hor[0]!=x_e_same_hor[1]):
-                        #print('burda3')
-                        for arg_in in all_args_uniq[dd]:
-                            #print(arg_in,'arg_in')
-                            args_to_be_unified.append(arg_in)
-                        y_selected=np.min(y_sep_same_hor)
-                        y_diff_selected=np.max(y_diff_same_hor)
-                        x_s_selected=np.min(x_s_same_hor)
-                        x_e_selected=np.max(x_e_same_hor)
-
-                        x_s_unified.append(x_s_selected)
-                        x_e_unified.append(x_e_selected)
-                        y_unified.append(y_selected)
-                        y_diff_unified.append(y_diff_selected)
-                    #print(x_s_same_hor,'x_s_same_hor')
-                    #print(x_e_same_hor[:]-1,'x_e_same_hor')
-                    #print('#############################')
-        #print(x_s_unified,'y_selected')
-        #print(x_e_unified,'x_s_selected')
-        #print(y_unified,'x_e_same_hor')
-
-        args_lines_not_unified=list( set(range(len(y_sep_returned)))-set(args_to_be_unified) )
-        #print(args_lines_not_unified,'args_lines_not_unified')
-
-        x_start_returned_not_unified=list( np.array(x_start_returned)[args_lines_not_unified] )
-        x_end_returned_not_unified=list( np.array(x_end_returned)[args_lines_not_unified] )
-        y_sep_returned_not_unified=list (np.array(y_sep_returned)[args_lines_not_unified] )
-        y_diff_returned_not_unified=list (np.array(y_diff_returned)[args_lines_not_unified] )
-
-        for dv in range(len(y_unified)):
-            y_sep_returned_not_unified.append(y_unified[dv])
-            y_diff_returned_not_unified.append(y_diff_unified[dv])
-            x_start_returned_not_unified.append(x_s_unified[dv])
-            x_end_returned_not_unified.append(x_e_unified[dv])
-
-        #print(y_sep_returned,'y_sep_returned')
     #print(x_start_returned,'x_start_returned')
     #print(x_end_returned,'x_end_returned')
-        x_start_returned = np.array(x_start_returned_not_unified, dtype=int)
-        x_end_returned = np.array(x_end_returned_not_unified, dtype=int)
-        y_sep_returned = np.array(y_sep_returned_not_unified, dtype=int)
-        y_diff_returned = np.array(y_diff_returned_not_unified, dtype=int)
-        #print(y_sep_returned,'y_sep_returned2')
+
+    # join/elongate separators if follow-up x and similar y
+    sep_pairs = contours_in_same_horizon(y_mid_returned)
+    if len(sep_pairs):
+        #print('burda')
+        args_to_be_unified = set()
+        y_mid_unified = []
+        y_max_unified = []
+        x_start_unified = []
+        x_end_unified = []
+        for pair in sep_pairs:
+            if (not np.array_equal(*x_start_returned[pair]) and
+                not np.array_equal(*x_end_returned[pair]) and
+                # immediately adjacent columns?
+                np.diff(x_end_returned[pair] -
+                        x_start_returned[pair])[0] in [1, -1]):
+
+                args_to_be_unified.union(set(pair))
+                y_mid_unified.append(np.min(y_mid_returned[pair]))
+                y_max_unified.append(np.max(y_max_returned[pair]))
+                x_start_unified.append(np.min(x_start_returned[pair]))
+                x_end_unified.append(np.max(x_end_returned[pair]))
+                #print(pair,'pair')
+                #print(x_start_returned[pair],'x_s_same_hor')
+                #print(x_end_returned[pair],'x_e_same_hor')
+                #print(y_mid_unified,'y_mid_unified')
+                #print(y_max_unified,'y_max_unified')
+                #print(x_start_unified,'x_s_unified')
+                #print(x_end_unified,'x_e_selected')
+                #print('#############################')
+
+        if len(y_mid_unified):
+            args_lines_not_unified = np.setdiff1d(np.arange(len(y_mid_returned)),
+                                                  list(args_to_be_unified), assume_unique=True)
+            #print(args_lines_not_unified,'args_lines_not_unified')
+            x_start_returned = np.append(x_start_returned[args_lines_not_unified],
+                                         x_start_unified, axis=0)
+            x_end_returned = np.append(x_end_returned[args_lines_not_unified],
+                                       x_end_unified, axis=0)
+            y_mid_returned = np.append(y_mid_returned[args_lines_not_unified],
+                                       y_mid_unified, axis=0)
+            y_max_returned = np.append(y_max_returned[args_lines_not_unified],
+                                       y_max_unified, axis=0)
+            #print(y_mid_returned,'y_mid_returned2')
     #print(x_start_returned,'x_start_returned2')
     #print(x_end_returned,'x_end_returned2')
-    #print(new_main_sep_y,'new_main_sep_y')

+    #print(new_main_sep_y,'new_main_sep_y')
     #print(x_start,'x_start')
     #print(x_end,'x_end')
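Each `pair` delivered by `contours_in_same_horizon` above indexes two separators sitting on roughly the same horizontal band; if their column ranges are adjacent, they are replaced by one joined separator whose extent is the min/max over the pair. A toy version of that merge step (numbers invented):

    import numpy as np

    # two separators on (nearly) the same line, covering adjacent column ranges
    x_start_returned = np.array([0, 2])
    x_end_returned   = np.array([2, 4])
    y_mid_returned   = np.array([310, 305])
    y_max_returned   = np.array([318, 313])
    pair = np.array([0, 1])

    # joined into a single separator spanning both ranges, as in the block above
    x_start_joined = np.min(x_start_returned[pair])  # 0
    x_end_joined   = np.max(x_end_returned[pair])    # 4
    y_mid_joined   = np.min(y_mid_returned[pair])    # 305
    y_max_joined   = np.max(y_max_returned[pair])    # 318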
-    if len(new_main_sep_y)>0:
-
-        min_ys=np.min(y_sep)
-        max_ys=np.max(y_sep)
-
-        y_mains=[]
-        y_mains.append(min_ys)
-        y_mains_sep_ohne_grenzen=[]
-
-        for ii in range(len(new_main_sep_y)):
-            y_mains.append(y_sep[new_main_sep_y[ii]])
-            y_mains_sep_ohne_grenzen.append(y_sep[new_main_sep_y[ii]])
-
-        y_mains.append(max_ys)
-
-        y_mains_sorted=np.sort(y_mains)
-        diff=np.diff(y_mains_sorted)
-        argm=np.argmax(diff)
-
-        y_min_new=y_mains_sorted[argm]
-        y_max_new=y_mains_sorted[argm+1]
-
-        #print(y_min_new,'y_min_new')
-        #print(y_max_new,'y_max_new')
-        #print(y_sep[new_main_sep_y[0]],y_sep,'yseps')
     x_start = np.array(x_start)
     x_end = np.array(x_end)
-    kind=np.array(kind)
-    y_sep=np.array(y_sep)
-        if (y_min_new in y_mains_sep_ohne_grenzen and
-            y_max_new in y_mains_sep_ohne_grenzen):
-            x_start=x_start[(y_sep>y_min_new) & (y_sep<y_max_new)]
-            x_end=x_end[(y_sep>y_min_new) & (y_sep<y_max_new)]
-            kind=kind[(y_sep>y_min_new) & (y_sep<y_max_new)]
-            y_sep=y_sep[(y_sep>y_min_new) & (y_sep<y_max_new)]
-        elif (y_min_new in y_mains_sep_ohne_grenzen and
-              y_max_new not in y_mains_sep_ohne_grenzen):
-            #print('burda')
-            x_start=x_start[(y_sep>y_min_new) & (y_sep<=y_max_new)]
-            #print('burda1')
-            x_end=x_end[(y_sep>y_min_new) & (y_sep<=y_max_new)]
-            #print('burda2')
-            kind=kind[(y_sep>y_min_new) & (y_sep<=y_max_new)]
-            y_sep=y_sep[(y_sep>y_min_new) & (y_sep<=y_max_new)]
-        elif (y_min_new not in y_mains_sep_ohne_grenzen and
-              y_max_new in y_mains_sep_ohne_grenzen):
-            x_start=x_start[(y_sep>=y_min_new) & (y_sep<y_max_new)]
-            x_end=x_end[(y_sep>=y_min_new) & (y_sep<y_max_new)]
-            kind=kind[(y_sep>=y_min_new) & (y_sep<y_max_new)]
-            y_sep=y_sep[(y_sep>=y_min_new) & (y_sep<y_max_new)]
-        else:
-            x_start=x_start[(y_sep>=y_min_new) & (y_sep<=y_max_new)]
-            x_end=x_end[(y_sep>=y_min_new) & (y_sep<=y_max_new)]
-            kind=kind[(y_sep>=y_min_new) & (y_sep<=y_max_new)]
-            y_sep=y_sep[(y_sep>=y_min_new) & (y_sep<=y_max_new)]
+    y_mid = np.array(y_mid)
+    if len(new_main_sep_y):
+        # some full-width multi-span separators exist, so
+        # restrict the y range of separators to search for
+        # mutual overlaps to only those within the largest
+        # y strip between adjacent multi-span separators
+        # that involve at least one such full-width seps.
+        # (does not affect the separators to be returned)
+        min_ys=np.min(y_mid)
+        max_ys=np.max(y_mid)
+        #print(min_ys,'min_ys')
+        #print(max_ys,'max_ys')
+
+        y_mains0 = list(y_mid[new_main_sep_y])
+        y_mains = [min_ys] + y_mains0 + [max_ys]
+
+        y_mains = np.sort(y_mains)
+        argm = np.argmax(np.diff(y_mains))
+        y_mid_new = y_mains[argm]
+        y_mid_next_new = y_mains[argm + 1]
+
+        #print(y_mid_new,argm,'y_mid_new')
+        #print(y_mid_next_new,argm+1,'y_mid_next_new')
+        #print(y_mid[new_main_sep_y],new_main_sep_y,'yseps')
+        x_start=np.array(x_start)
+        x_end=np.array(x_end)
+        y_mid=np.array(y_mid)
+        # iff either boundary is itself not a full-width separator,
+        # then include it in the range of separators to be kept
+        if y_mid_new in y_mains0:
+            where = y_mid > y_mid_new
+        else:
+            where = y_mid >= y_mid_new
+        if y_mid_next_new in y_mains0:
+            where &= y_mid < y_mid_next_new
+        else:
+            where &= y_mid <= y_mid_next_new
+        x_start = x_start[where]
+        x_end = x_end[where]
+        y_mid = y_mid[where]
     #print(x_start,'x_start')
     #print(x_end,'x_end')
-    #print(len_sep)

+    # remove redundant separators that span the same columns
+    # (keeping only 1 representative each)
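The largest vertical strip between consecutive full-width separators is found with a plain sort/diff/argmax, as in the block above; a toy run (y values invented):

    import numpy as np

    y_mains = np.sort(np.array([50, 400, 1200, 1900]))  # full-width separators plus overall min/max y
    argm = np.argmax(np.diff(y_mains))                  # index of the widest gap -> 1
    y_mid_new, y_mid_next_new = y_mains[argm], y_mains[argm + 1]   # strip between y=400 and y=1200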
     deleted = set()
-    for i in range(len(x_start)-1):
-        nodes_i=set(range(x_start[i],x_end[i]+1))
-        for j in range(i+1,len(x_start)):
-            if nodes_i==set(range(x_start[j],x_end[j]+1)):
-                deleted.add(j)
-    #print(np.unique(deleted))
+    for index_i in range(len(x_start) - 1):
+        nodes_i = set(range(x_start[index_i], x_end[index_i] + 1))
+        #print(nodes_i, "nodes_i")
+        for index_j in range(index_i + 1, len(x_start)):
+            nodes_j = set(range(x_start[index_j], x_end[index_j] + 1))
+            #print(nodes_j, "nodes_j")
+            if nodes_i == nodes_j:
+                deleted.add(index_j)
+    #print(deleted,"deleted")
     remained_sep_indexes = set(range(len(x_start))) - deleted
     #print(remained_sep_indexes,'remained_sep_indexes')
-    mother=[]#if it has mother
-    child=[]
+
+    # determine which separators span which columns
+    mother = [] # whether the respective separator has a mother separator
+    child = [] # whether the respective separator has a child separator
     for index_i in remained_sep_indexes:
         have_mother=0
         have_child=0
-        nodes_ind=set(range(x_start[index_i],x_end[index_i]+1))
+        nodes_i = set(range(x_start[index_i], x_end[index_i] + 1))
         for index_j in remained_sep_indexes:
-            nodes_ind_j=set(range(x_start[index_j],x_end[index_j]+1))
-            if nodes_ind<nodes_ind_j:
+            nodes_j = set(range(x_start[index_j], x_end[index_j] + 1))
+            if nodes_i < nodes_j:
                 have_mother=1
-            if nodes_ind>nodes_ind_j:
+            if nodes_i > nodes_j:
                 have_child=1
         mother.append(have_mother)
         child.append(have_child)
-    #print(mother,'mother')
-    #print(len(remained_sep_indexes))
-    #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_sep),'lens')
-    y_lines_without_mother=[]
-    x_start_without_mother=[]
-    x_end_without_mother=[]
-
-    y_lines_with_child_without_mother=[]
-    x_start_with_child_without_mother=[]
-    x_end_with_child_without_mother=[]
+    #print(mother, "mother")
+    #print(child, "child")

     mother = np.array(mother)
     child = np.array(child)
     #print(mother,'mother')
     #print(child,'child')
     remained_sep_indexes = np.array(list(remained_sep_indexes))
-    x_start = np.array(x_start)
-    x_end = np.array(x_end)
-    y_sep = np.array(y_sep)
+    #print(len(remained_sep_indexes))
+    #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_mid),'lens')

-    if len(remained_sep_indexes)>1:
+    reading_order_type = 0
+    if len(remained_sep_indexes):
         #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)')
         #print(np.array(mother),'mother')
         remained_sep_indexes_without_mother = remained_sep_indexes[mother==0]

@@ -262,52 +265,53 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order(

         x_end_with_child_without_mother = x_end[remained_sep_indexes_with_child_without_mother]
         x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother]
-        y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother]
+        y_mid_with_child_without_mother = y_mid[remained_sep_indexes_with_child_without_mother]

-        reading_order_type=0
         x_end_without_mother = x_end[remained_sep_indexes_without_mother]
         x_start_without_mother = x_start[remained_sep_indexes_without_mother]
-        y_lines_without_mother = y_sep[remained_sep_indexes_without_mother]
+        y_mid_without_mother = y_mid[remained_sep_indexes_without_mother]

         if len(remained_sep_indexes_without_mother)>=2:
             for i in range(len(remained_sep_indexes_without_mother)-1):
-                nodes_i=set(range(x_start[remained_sep_indexes_without_mother[i]],
-                                  x_end[remained_sep_indexes_without_mother[i]]
-                                  # + 1
-                                  ))
+                index_i = remained_sep_indexes_without_mother[i]
+                nodes_i = set(range(x_start[index_i], x_end[index_i])) # + 1
+                #print(index_i, nodes_i, "nodes_i without mother")
                 for j in range(i + 1, len(remained_sep_indexes_without_mother)):
-                    nodes_j=set(range(x_start[remained_sep_indexes_without_mother[j]],
-                                      x_end[remained_sep_indexes_without_mother[j]]
-                                      # + 1
-                                      ))
+                    index_j = remained_sep_indexes_without_mother[j]
+                    nodes_j = set(range(x_start[index_j], x_end[index_j])) # + 1
+                    #print(index_j, nodes_j, "nodes_j without mother")
                     if nodes_i - nodes_j != nodes_i:
+                        #print("type=1")
                         reading_order_type = 1
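The test `nodes_i - nodes_j != nodes_i` above flags top-level separators whose column spans overlap; it is a set-intersection check in disguise (toy spans):

    nodes_i = set(range(0, 2))   # a separator over column slots 0..1
    nodes_j = set(range(1, 3))   # another one over column slots 1..2

    # the set difference only changes when the two spans share a column slot,
    # so the test is equivalent to a non-empty intersection
    assert (nodes_i - nodes_j != nodes_i) == bool(nodes_i & nodes_j)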
     else:
-        reading_order_type = 0
-    #print(reading_order_type,'javab')
-    #print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother')
+        y_mid_without_mother = np.zeros(0, int)
+        x_start_without_mother = np.zeros(0, int)
+        x_end_without_mother = np.zeros(0, int)
+        y_mid_with_child_without_mother = np.zeros(0, int)
+        x_start_with_child_without_mother = np.zeros(0, int)
+        x_end_with_child_without_mother = np.zeros(0, int)

+    #print(reading_order_type,'reading_order_type')
+    #print(y_mid_with_child_without_mother,'y_mid_with_child_without_mother')
     #print(x_start_with_child_without_mother,'x_start_with_child_without_mother')
     #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother')

     len_sep_with_child = len(child[child==1])

     #print(len_sep_with_child,'len_sep_with_child')
     there_is_sep_with_child = 0
     if len_sep_with_child >= 1:
         there_is_sep_with_child = 1
-    #print(all_args_uniq,'all_args_uniq')
-    #print(args_to_be_unified,'args_to_be_unified')

     return (reading_order_type,
             x_start_returned,
             x_end_returned,
-            y_sep_returned,
-            y_diff_returned,
-            y_lines_without_mother,
+            y_mid_returned,
+            y_max_returned,
+            y_mid_without_mother,
             x_start_without_mother,
             x_end_without_mother,
             there_is_sep_with_child,
-            y_lines_with_child_without_mother,
+            y_mid_with_child_without_mother,
             x_start_with_child_without_mother,
             x_end_with_child_without_mother,
             new_main_sep_y)