mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-12-01 08:44:13 +01:00
return_boxes_of_images_by_order_of_reading_new: improve
- when analysing regions spanning across columns, disregard tiny regions (smaller than half the median size)
- if a region spans across columns just by a tiny fraction, and therefore is not good enough for a multi-col separator, then it should also not be good enough for a multi-col box maker
This commit is contained in:
parent
b71bb80e3a
commit
5abf0c1097
1 changed file with 8 additions and 4 deletions
|
|
@ -1720,6 +1720,7 @@ def return_boxes_of_images_by_order_of_reading_new(
|
|||
# analyse connected components of regions to gain additional separators
|
||||
# and prepare a map for cross-column boxes
|
||||
ccounts = np.bincount(ccomps[top: bot].flatten())
|
||||
ccounts_median = np.median(ccounts)
|
||||
col_ccounts = np.stack([np.bincount(ccomps[top: bot, left: right].flatten(),
|
||||
minlength=ccounts.size)
|
||||
for left, right in pairwise(peaks_neg_tot)])
|
||||
|
|
@ -1727,6 +1728,9 @@ def return_boxes_of_images_by_order_of_reading_new(
|
|||
for label, label_count in enumerate(ccounts):
|
||||
if not label:
|
||||
continue
|
||||
# ignore small labels for the purpose of finding multicol seps
|
||||
if label_count < 0.5 * ccounts_median:
|
||||
continue
|
||||
label_left, label_top, label_width, label_height, label_area = cstats[label]
|
||||
# if label_count < 0.9 * label_area:
|
||||
# # mostly not in this part of the page
|
||||
|
|
@ -1738,15 +1742,15 @@ def return_boxes_of_images_by_order_of_reading_new(
|
|||
label_bot = label_top + label_height
|
||||
label_start = np.flatnonzero(peaks_neg_tot > label_left)[0] - 1
|
||||
label_end = np.flatnonzero(peaks_neg_tot >= label_right)[0]
|
||||
if label_end - label_start < 2:
|
||||
continue
|
||||
if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2:
|
||||
continue
|
||||
# store as dict for multi-column boxes:
|
||||
for start in range(label_start, label_end):
|
||||
labelcolmap.setdefault(start, list()).append(
|
||||
(label_end, label_top, label_bot, sum(col_ccounts[start: label_end, label])))
|
||||
# make additional separators:
|
||||
if label_end - label_start < 2:
|
||||
continue
|
||||
if np.count_nonzero(col_ccounts[:, label] > 0.1 * label_count) < 2:
|
||||
continue
|
||||
x_min_hor_some = np.append(x_min_hor_some, [label_left] * 2)
|
||||
x_max_hor_some = np.append(x_max_hor_some, [label_right] * 2)
|
||||
y_min_hor_some = np.append(y_min_hor_some, [label_top - 2, label_bot])
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue