mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-12-01 08:44:13 +01:00
return_boxes_of_images_by_order_of_reading_new: improve
- when analysing regions spanning across columns, disregard tiny regions (smaller than half the median size)
- if a region spans across columns by only a tiny fraction, and is therefore not good enough for a multi-col separator, then it should not be good enough for a multi-col box maker either
This commit is contained in:
parent
b71bb80e3a
commit
5abf0c1097
1 changed file with 8 additions and 4 deletions
|
|
@@ -1720,6 +1720,7 @@ def return_boxes_of_images_by_order_of_reading_new(
|
||||||
# analyse connected components of regions to gain additional separators
|
# analyse connected components of regions to gain additional separators
|
||||||
# and prepare a map for cross-column boxes
|
# and prepare a map for cross-column boxes
|
||||||
ccounts = np.bincount(ccomps[top: bot].flatten())
|
ccounts = np.bincount(ccomps[top: bot].flatten())
|
||||||
|
ccounts_median = np.median(ccounts)
|
||||||
col_ccounts = np.stack([np.bincount(ccomps[top: bot, left: right].flatten(),
|
col_ccounts = np.stack([np.bincount(ccomps[top: bot, left: right].flatten(),
|
||||||
minlength=ccounts.size)
|
minlength=ccounts.size)
|
||||||
for left, right in pairwise(peaks_neg_tot)])
|
for left, right in pairwise(peaks_neg_tot)])
|
||||||
|
|
@@ -1727,6 +1728,9 @@ def return_boxes_of_images_by_order_of_reading_new(
|
||||||
for label, label_count in enumerate(ccounts):
|
for label, label_count in enumerate(ccounts):
|
||||||
if not label:
|
if not label:
|
||||||
continue
|
continue
|
||||||
|
# ignore small labels for the purpose of finding multicol seps
|
||||||
|
if label_count < 0.5 * ccounts_median:
|
||||||
|
continue
|
||||||
label_left, label_top, label_width, label_height, label_area = cstats[label]
|
label_left, label_top, label_width, label_height, label_area = cstats[label]
|
||||||
# if label_count < 0.9 * label_area:
|
# if label_count < 0.9 * label_area:
|
||||||
# # mostly not in this part of the page
|
# # mostly not in this part of the page
|
||||||
|
|
@@ -1738,15 +1742,15 @@ def return_boxes_of_images_by_order_of_reading_new(
|
||||||
label_bot = label_top + label_height
|
label_bot = label_top + label_height
|
||||||
label_start = np.flatnonzero(peaks_neg_tot > label_left)[0] - 1
|
label_start = np.flatnonzero(peaks_neg_tot > label_left)[0] - 1
|
||||||
label_end = np.flatnonzero(peaks_neg_tot >= label_right)[0]
|
label_end = np.flatnonzero(peaks_neg_tot >= label_right)[0]
|
||||||
|
if label_end - label_start < 2:
|
||||||
|
continue
|
||||||
|
if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2:
|
||||||
|
continue
|
||||||
# store as dict for multi-column boxes:
|
# store as dict for multi-column boxes:
|
||||||
for start in range(label_start, label_end):
|
for start in range(label_start, label_end):
|
||||||
labelcolmap.setdefault(start, list()).append(
|
labelcolmap.setdefault(start, list()).append(
|
||||||
(label_end, label_top, label_bot, sum(col_ccounts[start: label_end, label])))
|
(label_end, label_top, label_bot, sum(col_ccounts[start: label_end, label])))
|
||||||
# make additional separators:
|
# make additional separators:
|
||||||
if label_end - label_start < 2:
|
|
||||||
continue
|
|
||||||
if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2:
|
|
||||||
continue
|
|
||||||
x_min_hor_some = np.append(x_min_hor_some, [label_left] * 2)
|
x_min_hor_some = np.append(x_min_hor_some, [label_left] * 2)
|
||||||
x_max_hor_some = np.append(x_max_hor_some, [label_right] * 2)
|
x_max_hor_some = np.append(x_max_hor_some, [label_right] * 2)
|
||||||
y_min_hor_some = np.append(y_min_hor_some, [label_top - 2, label_bot])
|
y_min_hor_some = np.append(y_min_hor_some, [label_top - 2, label_bot])
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue