simplify, add confidence for headings as well

This commit is contained in:
Robert Sachunsky 2026-04-21 01:06:41 +02:00
parent 264b00f8ab
commit a2f43b8d69
2 changed files with 24 additions and 26 deletions

View file

@@ -1712,19 +1712,20 @@ class Eynollah:
#print(time.time()-t_0_box,'time box in 3') #print(time.time()-t_0_box,'time box in 3')
t1 = time.time() t1 = time.time()
if np.abs(slope_deskew) < SLOPE_THRESHOLD: if np.abs(slope_deskew) < SLOPE_THRESHOLD:
boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( boxes, _ = return_boxes_of_images_by_order_of_reading_new(
splitter_y_new, regions_without_separators, splitter_y_new, regions_without_separators,
text_regions_p == label_seps_fl, matrix_of_seps_ch, text_regions_p == label_seps_fl, matrix_of_seps_ch,
num_col_classifier, erosion_hurts, self.tables, self.right2left) num_col_classifier, erosion_hurts, self.tables, self.right2left,
logger=self.logger)
boxes_d = None boxes_d = None
self.logger.debug("len(boxes): %s", len(boxes)) self.logger.debug("len(boxes): %s", len(boxes))
#print(time.time()-t_0_box,'time box in 3.1') #print(time.time()-t_0_box,'time box in 3.1')
else: else:
boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( boxes_d, _ = return_boxes_of_images_by_order_of_reading_new(
splitter_y_new_d, regions_without_separators_d, splitter_y_new_d, regions_without_separators_d,
text_regions_p_d == label_seps_fl, matrix_of_seps_ch_d, text_regions_p_d == label_seps_fl, matrix_of_seps_ch_d,
num_col_classifier, erosion_hurts, self.tables, self.right2left) num_col_classifier, erosion_hurts, self.tables, self.right2left,
logger=self.logger)
boxes = None boxes = None
self.logger.debug("len(boxes): %s", len(boxes_d)) self.logger.debug("len(boxes): %s", len(boxes_d))
@@ -2843,20 +2844,14 @@ class Eynollah:
if not self.reading_order_machine_based: if not self.reading_order_machine_based:
label_seps = 6 label_seps = 6
if not self.headers_off:
if np.abs(slope_deskew) < SLOPE_THRESHOLD: if np.abs(slope_deskew) < SLOPE_THRESHOLD:
num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( _, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document(
text_regions_p, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h) text_regions_p, num_col_classifier, self.tables, label_seps,
contours_h=None if self.headers_off else contours_only_text_parent_h)
else: else:
_, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(
text_regions_p_d, num_col_classifier, self.tables, label_seps, contours_only_text_parent_h_d_ordered) text_regions_p_d, num_col_classifier, self.tables, label_seps,
elif self.headers_off: contours_h=None if self.headers_off else contours_only_text_parent_h_d_ordered)
if np.abs(slope_deskew) < SLOPE_THRESHOLD:
num_col, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document(
text_regions_p, num_col_classifier, self.tables, label_seps)
else:
_, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(
text_regions_p_d, num_col_classifier, self.tables, label_seps)
if not erosion_hurts: if not erosion_hurts:
if np.abs(slope_deskew) < SLOPE_THRESHOLD: if np.abs(slope_deskew) < SLOPE_THRESHOLD:

View file

@@ -899,16 +899,19 @@ def split_textregion_main_vs_head(
h_o = regions_model_1.shape[0] h_o = regions_model_1.shape[0]
w_o = regions_model_1.shape[1] w_o = regions_model_1.shape[1]
zoom = 3 zoom = 3
regions_model_1 = cv2.resize(regions_model_1, (regions_model_1.shape[1] // zoom, regions_model_1 = cv2.resize(regions_model_1,
(regions_model_1.shape[1] // zoom,
regions_model_1.shape[0] // zoom), regions_model_1.shape[0] // zoom),
interpolation=cv2.INTER_NEAREST) interpolation=cv2.INTER_NEAREST)
regions_model_full = cv2.resize(regions_model_full, (regions_model_full.shape[1] // zoom, regions_model_full = cv2.resize(regions_model_full,
(regions_model_full.shape[1] // zoom,
regions_model_full.shape[0] // zoom), regions_model_full.shape[0] // zoom),
interpolation=cv2.INTER_NEAREST) interpolation=cv2.INTER_NEAREST)
contours_only_text_parent_z = [(cnt / zoom).astype(int) for cnt in contours_only_text_parent] contours_only_text_parent_z = [contour // zoom
for contour in contours_only_text_parent]
### ###
cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin = \ _, _, x_min_main, x_max_main, y_min_main, y_max_main, _ = \
find_new_features_of_contours(contours_only_text_parent_z) find_new_features_of_contours(contours_only_text_parent_z)
length_con=x_max_main-x_min_main length_con=x_max_main-x_min_main
@@ -947,7 +950,7 @@ def split_textregion_main_vs_head(
regions_model_1[(regions_model_1 == label_text) & (parent > 0)] = label_head_final regions_model_1[(regions_model_1 == label_text) & (parent > 0)] = label_head_final
contours_only_text_parent_head.append(contours_only_text_parent[ii]) contours_only_text_parent_head.append(contours_only_text_parent[ii])
conf_contours_head.append(None) # why not conf_contours[ii], too? conf_contours_head.append(conf_contours[ii])
if len(contours_only_text_parent_d_ordered): if len(contours_only_text_parent_d_ordered):
contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii])
all_box_coord_head.append(all_box_coord[ii]) all_box_coord_head.append(all_box_coord[ii])