mirror of
https://github.com/qurator-spk/eynollah.git
synced 2025-12-01 08:44:13 +01:00
column detection: improve, aided by vertical separators (vseps) whenever possible
- `find_number_of_columns_in_document`: retain vertical separators
and pass them to `find_num_col` for each vertical split
- `return_boxes_of_images_by_order_of_reading_new`: reconstruct
the vertical separators from the segmentation mask and the separator
bboxes; pass them on to `find_num_col` everywhere
- `return_boxes_of_images_by_order_of_reading_new`: no need to
try-catch `find_num_col` anymore
- `return_boxes_of_images_by_order_of_reading_new`: when a vertical
split has too few columns,
* do not raise but lower the threshold `multiplier` responsible for
allowing gaps as column boundaries
* do not pass the `num_col_classifier` (i.e. the expected number of
resulting columns) of the entire page to the iterative
`find_num_col` for each existing column, but only the share of it
proportional to that column's span (its width relative to the page width)
This commit is contained in:
parent
4dd40c542b
commit
5a3de3b42d
1 changed file with 68 additions and 29 deletions
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Tuple
|
from typing import List, Tuple
|
||||||
from logging import getLogger
|
from logging import getLogger
|
||||||
import time
|
import time
|
||||||
import math
|
import math
|
||||||
|
|
@ -1315,7 +1315,35 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point):
|
||||||
peaks_neg_tot.append(last_point)
|
peaks_neg_tot.append(last_point)
|
||||||
return peaks_neg_tot
|
return peaks_neg_tot
|
||||||
|
|
||||||
def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None):
|
def find_number_of_columns_in_document(
|
||||||
|
region_pre_p: np.ndarray,
|
||||||
|
num_col_classifier: int,
|
||||||
|
tables: bool,
|
||||||
|
label_seps: int,
|
||||||
|
contours_h: List[np.ndarray] = None,
|
||||||
|
logger=None
|
||||||
|
) -> Tuple[int, List[int], np.ndarray, List[int], np.ndarray]:
|
||||||
|
"""
|
||||||
|
Extract vertical and horizontal separators, vertical splits and horizontal column boundaries on page.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
* region_pre_p: segmentation map of the page
|
||||||
|
* num_col_classifier: predicted (expected) number of columns of the page
|
||||||
|
* tables: whether tables may be present
|
||||||
|
* label_seps: segmentation map class label for separators
|
||||||
|
* contours_h: polygons of potential headings (serving as additional horizontal separators)
|
||||||
|
* logger
|
||||||
|
|
||||||
|
Returns: a tuple of
|
||||||
|
* the actual number of columns found
|
||||||
|
* the x coordinates of the column boundaries
|
||||||
|
* an array of the separators (bounding boxes and types)
|
||||||
|
* the y coordinates of the page splits
|
||||||
|
* a mask of the separators
|
||||||
|
"""
|
||||||
|
if logger is None:
|
||||||
|
logger = getLogger(__package__)
|
||||||
|
|
||||||
separators_closeup = 1 * (region_pre_p == label_seps)
|
separators_closeup = 1 * (region_pre_p == label_seps)
|
||||||
separators_closeup[0:110] = 0
|
separators_closeup[0:110] = 0
|
||||||
separators_closeup[-150:] = 0
|
separators_closeup[-150:] = 0
|
||||||
|
|
@ -1483,8 +1511,11 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables,
|
||||||
num_big_parts += 1
|
num_big_parts += 1
|
||||||
try:
|
try:
|
||||||
num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot],
|
num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot],
|
||||||
num_col_classifier, tables, multiplier=7.0)
|
num_col_classifier, tables,
|
||||||
# print("big part %d:%d has %d columns" % (top, bot, num_col + 1), peaks_neg_fin)
|
vertical_separators=1 * (vertical[top: bot] > 0),
|
||||||
|
multiplier=7.0)
|
||||||
|
logger.debug("big part %d:%d has %d columns", top, bot, num_col + 1)
|
||||||
|
# print(peaks_neg_fin)
|
||||||
except:
|
except:
|
||||||
num_col = 0
|
num_col = 0
|
||||||
peaks_neg_fin = []
|
peaks_neg_fin = []
|
||||||
|
|
@ -1522,7 +1553,8 @@ def return_boxes_of_images_by_order_of_reading_new(
|
||||||
* matrix_of_seps: type and coordinates of horizontal and vertical separators,
|
* matrix_of_seps: type and coordinates of horizontal and vertical separators,
|
||||||
as well as headings
|
as well as headings
|
||||||
* num_col_classifier: predicted number of columns for the entire page
|
* num_col_classifier: predicted number of columns for the entire page
|
||||||
* erosion_hurts: bool
|
* erosion_hurts: whether region masks have already been eroded
|
||||||
|
(and thus gaps can be expected to be wider)
|
||||||
* tables: bool
|
* tables: bool
|
||||||
* right2left_readingorder: whether to invert the default left-to-right order
|
* right2left_readingorder: whether to invert the default left-to-right order
|
||||||
|
|
||||||
|
|
@ -1578,6 +1610,12 @@ def return_boxes_of_images_by_order_of_reading_new(
|
||||||
height_tot, width_tot = regions_without_separators.shape
|
height_tot, width_tot = regions_without_separators.shape
|
||||||
big_part = 22 * height_tot // 100 # percent height
|
big_part = 22 * height_tot // 100 # percent height
|
||||||
_, ccomps, cstats, _ = cv2.connectedComponentsWithStats(regions_without_separators.astype(np.uint8))
|
_, ccomps, cstats, _ = cv2.connectedComponentsWithStats(regions_without_separators.astype(np.uint8))
|
||||||
|
args_ver = matrix_of_seps_ch[:, 9] == 1
|
||||||
|
mask_ver = np.zeros_like(regions_without_separators, dtype=bool)
|
||||||
|
for i in np.flatnonzero(args_ver):
|
||||||
|
mask_ver[matrix_of_seps_ch[i, 6]: matrix_of_seps_ch[i, 7],
|
||||||
|
matrix_of_seps_ch[i, 2]: matrix_of_seps_ch[i, 3]] = True
|
||||||
|
vertical_seps = 1 * ((regions_with_separators == 6) & mask_ver)
|
||||||
for top, bot in pairwise(splitter_y_new):
|
for top, bot in pairwise(splitter_y_new):
|
||||||
# print("%d:%d" % (top, bot), 'i')
|
# print("%d:%d" % (top, bot), 'i')
|
||||||
# dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot))
|
# dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot))
|
||||||
|
|
@ -1589,16 +1627,13 @@ def return_boxes_of_images_by_order_of_reading_new(
|
||||||
#if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and
|
#if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and
|
||||||
# np.max(matrix_new[:,8][matrix_new[:,9]==1]) >=
|
# np.max(matrix_new[:,8][matrix_new[:,9]==1]) >=
|
||||||
# 0.1 * (np.abs(bot-top))):
|
# 0.1 * (np.abs(bot-top))):
|
||||||
try:
|
|
||||||
num_col, peaks_neg_fin = find_num_col(
|
num_col, peaks_neg_fin = find_num_col(
|
||||||
regions_without_separators[top:bot],
|
regions_without_separators[top:bot],
|
||||||
# we do not expect to get all columns in small parts (headings etc.):
|
# we do not expect to get all columns in small parts (headings etc.):
|
||||||
num_col_classifier if bot - top >= big_part else 1,
|
num_col_classifier if bot - top >= big_part else 1,
|
||||||
tables, multiplier=6. if erosion_hurts else 7.,
|
tables, vertical_separators=vertical_seps[top: bot],
|
||||||
|
multiplier=6. if erosion_hurts else 7.,
|
||||||
unbalanced=True)
|
unbalanced=True)
|
||||||
except:
|
|
||||||
peaks_neg_fin=[]
|
|
||||||
num_col = 0
|
|
||||||
try:
|
try:
|
||||||
if ((len(peaks_neg_fin) + 1 < num_col_classifier or
|
if ((len(peaks_neg_fin) + 1 < num_col_classifier or
|
||||||
num_col_classifier == 6) and
|
num_col_classifier == 6) and
|
||||||
|
|
@ -1606,12 +1641,18 @@ def return_boxes_of_images_by_order_of_reading_new(
|
||||||
bot - top >= big_part):
|
bot - top >= big_part):
|
||||||
# found too few columns here
|
# found too few columns here
|
||||||
#print('burda')
|
#print('burda')
|
||||||
|
logger.debug("searching for more than %d columns in big part %d:%d",
|
||||||
|
len(peaks_neg_fin) + 1, top, bot)
|
||||||
peaks_neg_fin_org = np.copy(peaks_neg_fin)
|
peaks_neg_fin_org = np.copy(peaks_neg_fin)
|
||||||
#print("peaks_neg_fin_org", peaks_neg_fin_org)
|
#print("peaks_neg_fin_org", peaks_neg_fin_org)
|
||||||
if len(peaks_neg_fin) == 0:
|
if len(peaks_neg_fin) == 0:
|
||||||
num_col, peaks_neg_fin = find_num_col(
|
num_col, peaks_neg_fin = find_num_col(
|
||||||
regions_without_separators[top:bot],
|
regions_without_separators[top:bot],
|
||||||
num_col_classifier, tables, multiplier=3., unbalanced=True)
|
num_col_classifier, tables,
|
||||||
|
vertical_separators=vertical_seps[top: bot],
|
||||||
|
# try to be less strict (lower threshold than above)
|
||||||
|
multiplier=7. if erosion_hurts else 8.,
|
||||||
|
unbalanced=True)
|
||||||
#print(peaks_neg_fin,'peaks_neg_fin')
|
#print(peaks_neg_fin,'peaks_neg_fin')
|
||||||
peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1]
|
peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1]
|
||||||
|
|
||||||
|
|
@ -1625,22 +1666,19 @@ def return_boxes_of_images_by_order_of_reading_new(
|
||||||
# plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0))
|
# plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0))
|
||||||
# plt.title("vertical projection (sum over y)")
|
# plt.title("vertical projection (sum over y)")
|
||||||
# plt.show()
|
# plt.show()
|
||||||
try:
|
# try to get more peaks with different multipliers
|
||||||
_, peaks_neg_fin1 = find_num_col(
|
num_col_expected = round((right - left) / width_tot * num_col_classifier)
|
||||||
regions_without_separators[top:bot, left:right],
|
args = regions_without_separators[top:bot, left:right], num_col_expected, tables
|
||||||
num_col_classifier, tables, multiplier=7.)
|
kwargs = dict(vertical_separators=vertical_seps[top: bot, left:right])
|
||||||
except:
|
_, peaks_neg_fin1 = find_num_col(*args, **kwargs, multiplier=7.)
|
||||||
peaks_neg_fin1 = []
|
_, peaks_neg_fin2 = find_num_col(*args, **kwargs, multiplier=5.)
|
||||||
try:
|
|
||||||
_, peaks_neg_fin2 = find_num_col(
|
|
||||||
regions_without_separators[top:bot, left:right],
|
|
||||||
num_col_classifier, tables, multiplier=5.)
|
|
||||||
except:
|
|
||||||
peaks_neg_fin2 = []
|
|
||||||
if len(peaks_neg_fin1) >= len(peaks_neg_fin2):
|
if len(peaks_neg_fin1) >= len(peaks_neg_fin2):
|
||||||
peaks_neg_fin = peaks_neg_fin1
|
peaks_neg_fin = peaks_neg_fin1
|
||||||
else:
|
else:
|
||||||
peaks_neg_fin = peaks_neg_fin2
|
peaks_neg_fin = peaks_neg_fin2
|
||||||
|
# print(peaks_neg_fin)
|
||||||
|
logger.debug("found %d additional column boundaries in %d:%d",
|
||||||
|
len(peaks_neg_fin), left, right)
|
||||||
# add offset to local result
|
# add offset to local result
|
||||||
peaks_neg_fin = list(np.array(peaks_neg_fin) + left)
|
peaks_neg_fin = list(np.array(peaks_neg_fin) + left)
|
||||||
#print(peaks_neg_fin,'peaks_neg_fin')
|
#print(peaks_neg_fin,'peaks_neg_fin')
|
||||||
|
|
@ -1652,6 +1690,7 @@ def return_boxes_of_images_by_order_of_reading_new(
|
||||||
#print(peaks_neg_fin_rev,'peaks_neg_fin_rev')
|
#print(peaks_neg_fin_rev,'peaks_neg_fin_rev')
|
||||||
|
|
||||||
if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org):
|
if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org):
|
||||||
|
#print("found more peaks than at first glance", peaks_neg_fin_rev, peaks_neg_fin_org)
|
||||||
peaks_neg_fin = peaks_neg_fin_rev
|
peaks_neg_fin = peaks_neg_fin_rev
|
||||||
else:
|
else:
|
||||||
peaks_neg_fin = peaks_neg_fin_org
|
peaks_neg_fin = peaks_neg_fin_org
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue