mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-10-26 23:34:13 +01:00
reading order: improve handling of headings and horizontal seps
- drop connected components analysis to test overlaps between horizontal separators and (horizontal) neighbours (introduced in ab17a927) - instead of converting headings to topline and baseline during `find_number_of_columns_in_document` (introduced in 9f1595d7), add them to the matrix unchanged, but mark as extra type (besides horizontal and vertical separtors) - convert headings to toplines and baselines no earlier than in `return_boxes_of_images_by_order_of_reading_new` - for both headings and horizontal separators, if they already span multiple columns, check if they would overlap (horizontal) neighbours by looking at successively larger (left and right) intervals of columns (and pick the largest elongation which does not introduce any overlaps)
This commit is contained in:
parent
3367462d18
commit
19b2c3fa42
1 changed files with 80 additions and 47 deletions
|
|
@ -1387,8 +1387,6 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point):
|
|||
return peaks_neg_tot
|
||||
|
||||
def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None):
|
||||
ncomps, ccomps = cv2.connectedComponents(region_pre_p.astype(np.uint8))
|
||||
|
||||
separators_closeup = 1 * (region_pre_p == label_seps)
|
||||
separators_closeup[0:110] = 0
|
||||
separators_closeup[-150:] = 0
|
||||
|
|
@ -1414,14 +1412,6 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables,
|
|||
dist_ye = max_ye - min_ye
|
||||
if dist_ye <= 50 and dist_xe >= 3 * dist_ye:
|
||||
cnts_hor_e.append(cnt)
|
||||
labels = np.setdiff1d(np.unique(ccomps[med_ye]), [0])
|
||||
if len(labels) == 1:
|
||||
# mid line does not intersect with any other region
|
||||
# so add it as extra splitter line
|
||||
cnts_hor_e.append(np.array([[[0, med_ye]],
|
||||
[[ccomps.shape[1], med_ye]],
|
||||
[[ccomps.shape[1], med_ye + 1]],
|
||||
[[0, med_ye + 1]]]))
|
||||
|
||||
# delete horizontal contours (leaving only the edges)
|
||||
separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0)
|
||||
|
|
@ -1493,7 +1483,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables,
|
|||
slope_seps_org_hor=slope_seps_org_hor[dist_x_hor>=len_x/2.0]
|
||||
dist_x_hor=dist_x_hor[dist_x_hor>=len_x/2.0]
|
||||
|
||||
matrix_of_seps_ch=np.zeros((len(cy_seps_hor)+len(cx_seps_ver),10))
|
||||
matrix_of_seps_ch = np.zeros((len(cy_seps_hor)+len(cx_seps_ver), 10), dtype=int)
|
||||
matrix_of_seps_ch[:len(cy_seps_hor),0]=args_hor
|
||||
matrix_of_seps_ch[len(cy_seps_hor):,0]=args_ver
|
||||
matrix_of_seps_ch[len(cy_seps_hor):,1]=cx_seps_ver
|
||||
|
|
@ -1515,34 +1505,17 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables,
|
|||
if contours_h is not None:
|
||||
_, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \
|
||||
find_features_of_lines(contours_h)
|
||||
# matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1]))
|
||||
# args_head = np.arange(len(cy_head))
|
||||
# matrix_l_n[:, 0] = args_head
|
||||
# matrix_l_n[:, 2] = x_min_head+30
|
||||
# matrix_l_n[:, 3] = x_max_head-30
|
||||
# matrix_l_n[:, 4] = dist_x_head
|
||||
# matrix_l_n[:, 5] = y_min_head-3-8
|
||||
# matrix_l_n[:, 6] = y_min_head-5-8
|
||||
# matrix_l_n[:, 7] = y_max_head#y_min_head+1-8
|
||||
# matrix_l_n[:, 8] = 4
|
||||
# split at toplines (y_min_head) and baselines (y_max_head) instead of center (cy_head):
|
||||
cy_head = np.stack((y_min_head, y_max_head)).T.flatten()
|
||||
y_min_head, y_max_head = (np.stack((y_min_head - 2, y_max_head - 2)).T.flatten(),
|
||||
np.stack((y_min_head + 2, y_max_head + 2)).T.flatten())
|
||||
x_min_head = np.repeat(x_min_head, 2)
|
||||
x_max_head = np.repeat(x_max_head, 2)
|
||||
dist_x_head = np.repeat(dist_x_head, 2)
|
||||
matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1]))
|
||||
matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1]), dtype=int)
|
||||
args_head = np.arange(len(cy_head))
|
||||
matrix_l_n[:, 0] = args_head
|
||||
# +/- 30px to avoid crossing col peaks by accident
|
||||
matrix_l_n[:, 2] = x_min_head + 30
|
||||
matrix_l_n[:, 3] = x_max_head - 30
|
||||
matrix_l_n[:, 2] = x_min_head
|
||||
matrix_l_n[:, 3] = x_max_head
|
||||
matrix_l_n[:, 4] = dist_x_head
|
||||
matrix_l_n[:, 5] = cy_head
|
||||
matrix_l_n[:, 6] = y_min_head
|
||||
matrix_l_n[:, 7] = y_max_head
|
||||
matrix_l_n[:, 8] = 4
|
||||
matrix_l_n[:, 8] = y_max_head - y_min_head
|
||||
matrix_l_n[:, 9] = 2 # mark as heading (so it can be split into 2 horizontal separators as needed)
|
||||
matrix_of_seps_ch = np.append(
|
||||
matrix_of_seps_ch, matrix_l_n, axis=0)
|
||||
|
||||
|
|
@ -1551,9 +1524,12 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables,
|
|||
cy_seps_splitters = np.append(cy_seps_splitters, special_separators)
|
||||
|
||||
if contours_h is not None:
|
||||
cy_seps_splitters_head=cy_head[(x_min_head<=.16*region_pre_p.shape[1]) &
|
||||
y_min_splitters_head = y_min_head[(x_min_head<=.16*region_pre_p.shape[1]) &
|
||||
(x_max_head>=.84*region_pre_p.shape[1])]
|
||||
cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head)
|
||||
y_max_splitters_head = y_max_head[(x_min_head<=.16*region_pre_p.shape[1]) &
|
||||
(x_max_head>=.84*region_pre_p.shape[1])]
|
||||
cy_seps_splitters = np.append(cy_seps_splitters, y_min_splitters_head)
|
||||
cy_seps_splitters = np.append(cy_seps_splitters, y_max_splitters_head)
|
||||
|
||||
cy_seps_splitters = np.sort(cy_seps_splitters).astype(int)
|
||||
splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]]
|
||||
|
|
@ -1713,17 +1689,6 @@ def return_boxes_of_images_by_order_of_reading_new(
|
|||
#num_col, peaks_neg_fin = find_num_col(
|
||||
# regions_without_separators[top:bot,:],
|
||||
# multiplier=7.0)
|
||||
x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ]
|
||||
x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ]
|
||||
cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ]
|
||||
y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ]
|
||||
|
||||
if right2left_readingorder:
|
||||
x_max_hor_some_new = width_tot - x_min_hor_some
|
||||
x_min_hor_some_new = width_tot - x_max_hor_some
|
||||
x_min_hor_some =list(np.copy(x_min_hor_some_new))
|
||||
x_max_hor_some =list(np.copy(x_max_hor_some_new))
|
||||
|
||||
peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot])
|
||||
#print(peaks_neg_tot,'peaks_neg_tot')
|
||||
peaks_neg_tot_tables.append(peaks_neg_tot)
|
||||
|
|
@ -1731,6 +1696,74 @@ def return_boxes_of_images_by_order_of_reading_new(
|
|||
all_columns = set(range(len(peaks_neg_tot) - 1))
|
||||
#print("all_columns", all_columns)
|
||||
|
||||
# elongate horizontal separators+headings as much as possible without overlap
|
||||
args_nonver = matrix_new[:, 9] != 1
|
||||
regions_with_separators = np.copy(regions_without_separators[top:bot])
|
||||
for xmin, xmax, ymin, ymax in matrix_new[:, [2, 3, 6, 7]]:
|
||||
regions_with_separators[ymin - top: ymax - top, xmin: xmax] = 6
|
||||
# def dbg_imshow(box, title):
|
||||
# xmin, xmax, ymin, ymax = box
|
||||
# plt.imshow(regions_with_separators, extent=[0, width_tot, bot, top])
|
||||
# plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
|
||||
# fill=False, linewidth=1, edgecolor='r'))
|
||||
# plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax))
|
||||
# plt.show()
|
||||
for i in np.flatnonzero(args_nonver):
|
||||
xmin, xmax, ymin, ymax, typ = matrix_new[i, [2, 3, 6, 7, 9]]
|
||||
cut = regions_with_separators[ymin - top: ymax - top]
|
||||
# dbg_imshow([xmin, xmax, ymin, ymax], "separator %d (%s)" % (i, "heading" if typ else "horizontal"))
|
||||
starting = xmin - peaks_neg_tot
|
||||
min_start = np.flatnonzero(starting >= 0)[-1] # last left-of
|
||||
ending = xmax - peaks_neg_tot
|
||||
max_end = np.flatnonzero(ending < 0)[0] # first right-of
|
||||
# skip elongation unless this is already a multi-column separator/heading:
|
||||
if not max_end - min_start > 1:
|
||||
continue
|
||||
# is there anything left of min_start?
|
||||
for j in range(min_start):
|
||||
# dbg_imshow([peaks_neg_tot[j], xmin, ymin, ymax], "start of %d candidate %d" % (i, j))
|
||||
if not np.any(cut[:, peaks_neg_tot[j]: xmin]):
|
||||
# print("elongated sep", i, "typ", typ, "start", xmin, "to", j, peaks_neg_tot[j])
|
||||
matrix_new[i, 2] = peaks_neg_tot[j] + 1 # elongate to start of this column
|
||||
break
|
||||
# is there anything right of max_end?
|
||||
for j in range(len(peaks_neg_tot) - 1, max_end, -1):
|
||||
# dbg_imshow([xmax, peaks_neg_tot[j], ymin, ymax], "end of %d candidate %d" % (i, j))
|
||||
if not np.any(cut[:, xmax: peaks_neg_tot[j]]):
|
||||
# print("elongated sep", i, "typ", typ, "end", xmax, "to", j, peaks_neg_tot[j])
|
||||
matrix_new[i, 3] = peaks_neg_tot[j] - 1 # elongate to end of this column
|
||||
break
|
||||
|
||||
args_hor = matrix_new[:, 9] == 0
|
||||
x_min_hor_some = matrix_new[:, 2][args_hor]
|
||||
x_max_hor_some = matrix_new[:, 3][args_hor]
|
||||
y_max_hor_some = matrix_new[:, 7][args_hor]
|
||||
cy_hor_some = matrix_new[:, 5][args_hor]
|
||||
|
||||
args_head = matrix_new[:, 9] == 2
|
||||
x_min_hor_head = matrix_new[:, 2][args_head]
|
||||
x_max_hor_head = matrix_new[:, 3][args_head]
|
||||
y_min_hor_head = matrix_new[:, 6][args_head]
|
||||
y_max_hor_head = matrix_new[:, 7][args_head]
|
||||
cy_hor_head = matrix_new[:, 5][args_head]
|
||||
|
||||
# split headings at toplines (y_min_head) and baselines (y_max_head)
|
||||
# instead of merely adding their center (cy_head) as horizontal separator
|
||||
# (x +/- 30px to avoid crossing col peaks by accident)
|
||||
x_min_hor_some = np.append(x_min_hor_some, np.tile(x_min_hor_head + 30, 2))
|
||||
x_max_hor_some = np.append(x_max_hor_some, np.tile(x_max_hor_head - 30, 2))
|
||||
y_max_hor_some = np.append(y_max_hor_some, # baselines
|
||||
np.concatenate((y_min_hor_head + 2,
|
||||
y_max_hor_head + 2)))
|
||||
cy_hor_some = np.append(cy_hor_some, # toplines
|
||||
np.concatenate((y_min_hor_head - 2,
|
||||
y_max_hor_head - 2)))
|
||||
|
||||
if right2left_readingorder:
|
||||
x_max_hor_some = width_tot - x_min_hor_some
|
||||
x_min_hor_some = width_tot - x_max_hor_some
|
||||
|
||||
|
||||
reading_order_type, x_starting, x_ending, y_mid, y_max, \
|
||||
y_mid_without_mother, x_start_without_mother, x_end_without_mother, \
|
||||
there_is_sep_with_child, \
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue