@ -149,6 +149,7 @@ class Eynollah:
dir_out = None ,
dir_in = None ,
dir_of_cropped_images = None ,
extract_only_images = False ,
dir_of_layout = None ,
dir_of_deskewed = None ,
dir_of_all = None ,
@ -196,6 +197,7 @@ class Eynollah:
self . allow_scaling = allow_scaling
self . headers_off = headers_off
self . light_version = light_version
self . extract_only_images = extract_only_images
self . ignore_page_extraction = ignore_page_extraction
self . pcgts = pcgts
if not dir_in :
@ -226,6 +228,7 @@ class Eynollah:
self . model_page_dir = dir_models + " /eynollah-page-extraction_20210425 "
self . model_region_dir_p_ens = dir_models + " /eynollah-main-regions-ensembled_20210425 "
self . model_region_dir_p_ens_light = dir_models + " /eynollah-main-regions_20220314 "
self . model_region_dir_p_ens_light_only_images_extraction = dir_models + " /eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18 "
if self . textline_light :
self . model_textline_dir = dir_models + " /eynollah-textline_light_20210425 "
else :
@ -250,7 +253,23 @@ class Eynollah:
self . ls_imgs = os . listdir ( self . dir_in )
if dir_in and not light_version :
if dir_in and self . extract_only_images :
config = tf . compat . v1 . ConfigProto ( )
config . gpu_options . allow_growth = True
session = tf . compat . v1 . Session ( config = config )
set_session ( session )
self . model_page = self . our_load_model ( self . model_page_dir )
self . model_classifier = self . our_load_model ( self . model_dir_of_col_classifier )
self . model_bin = self . our_load_model ( self . model_dir_of_binarization )
#self.model_textline = self.our_load_model(self.model_textline_dir)
self . model_region = self . our_load_model ( self . model_region_dir_p_ens_light_only_images_extraction )
#self.model_region_fl_np = self.our_load_model(self.model_region_dir_fully_np)
#self.model_region_fl = self.our_load_model(self.model_region_dir_fully)
self . ls_imgs = os . listdir ( self . dir_in )
if dir_in and not ( light_version or self . extract_only_images ) :
config = tf . compat . v1 . ConfigProto ( )
config . gpu_options . allow_growth = True
session = tf . compat . v1 . Session ( config = config )
@ -464,6 +483,27 @@ class Eynollah:
return img_new , num_column_is_classified
def calculate_width_height_by_columns_extract_only_images ( self , img , num_col , width_early , label_p_pred ) :
self . logger . debug ( " enter calculate_width_height_by_columns " )
if num_col == 1 :
img_w_new = 700
elif num_col == 2 :
img_w_new = 900
elif num_col == 3 :
img_w_new = 1500
elif num_col == 4 :
img_w_new = 1800
elif num_col == 5 :
img_w_new = 2200
elif num_col == 6 :
img_w_new = 2500
img_h_new = int ( img . shape [ 0 ] / float ( img . shape [ 1 ] ) * img_w_new )
img_new = resize_image ( img , img_h_new , img_w_new )
num_column_is_classified = True
return img_new , num_column_is_classified
def resize_image_with_column_classifier ( self , is_image_enhanced , img_bin ) :
self . logger . debug ( " enter resize_image_with_column_classifier " )
if self . input_binary :
@ -571,13 +611,18 @@ class Eynollah:
self . logger . info ( " Found %d columns ( %s ) " , num_col , np . around ( label_p_pred , decimals = 5 ) )
if dpi < DPI_THRESHOLD :
img_new , num_column_is_classified = self . calculate_width_height_by_columns ( img , num_col , width_early , label_p_pred )
if light_version :
image_res = np . copy ( img_new )
if not self . extract_only_images :
if dpi < DPI_THRESHOLD :
img_new , num_column_is_classified = self . calculate_width_height_by_columns ( img , num_col , width_early , label_p_pred )
if light_version :
image_res = np . copy ( img_new )
else :
image_res = self . predict_enhancement ( img_new )
is_image_enhanced = True
else :
image_res = self . predict_enhancement ( img_new )
is_image_enhanced = True
num_column_is_classified = True
image_res = np . copy ( img )
is_image_enhanced = False
else :
num_column_is_classified = True
image_res = np . copy ( img )
@ -868,10 +913,14 @@ class Eynollah:
seg_not_base = label_p_pred [ 0 , : , : , 4 ]
##seg2 = -label_p_pred[0,:,:,2]
seg_not_base [ seg_not_base > 0.03 ] = 1
seg_not_base [ seg_not_base < 1 ] = 0
if self . extract_only_images :
#seg_not_base[seg_not_base>0.3] =1
seg_not_base [ seg_not_base > 0.5 ] = 1
seg_not_base [ seg_not_base < 1 ] = 0
else :
seg_not_base [ seg_not_base > 0.03 ] = 1
seg_not_base [ seg_not_base < 1 ] = 0
seg_test = label_p_pred [ 0 , : , : , 1 ]
@ -889,13 +938,10 @@ class Eynollah:
seg_line [ seg_line > 0.1 ] = 1
seg_line [ seg_line < 1 ] = 0
seg_background = label_p_pred [ 0 , : , : , 0 ]
##seg2 = -label_p_pred[0,:,:,2]
seg_background [ seg_background > 0.25 ] = 1
seg_background [ seg_background < 1 ] = 0
if not self . extract_only_images :
seg_background = label_p_pred [ 0 , : , : , 0 ]
seg_background [ seg_background > 0.25 ] = 1
seg_background [ seg_background < 1 ] = 0
##seg = seg+seg2
#seg = label_p_pred[0,:,:,2]
#seg[seg>0.4] =1
@ -908,8 +954,9 @@ class Eynollah:
##plt.show()
#seg[seg==1]=0
#seg[seg_test==1]=1
seg [ seg_not_base == 1 ] = 4
seg [ seg_background == 1 ] = 0
###seg[seg_not_base==1]=4
if not self . extract_only_images :
seg [ seg_background == 1 ] = 0
seg [ ( seg_line == 1 ) & ( seg == 0 ) ] = 3
seg_color = np . repeat ( seg [ : , : , np . newaxis ] , 3 , axis = 2 )
@ -1574,6 +1621,124 @@ class Eynollah:
q . put ( slopes_sub )
poly . put ( poly_sub )
box_sub . put ( boxes_sub_new )
def get_regions_light_v_extract_only_images ( self , img , is_image_enhanced , num_col_classifier ) :
self . logger . debug ( " enter get_regions_extract_images_only " )
erosion_hurts = False
img_org = np . copy ( img )
img_height_h = img_org . shape [ 0 ]
img_width_h = img_org . shape [ 1 ]
if num_col_classifier == 1 :
img_w_new = 700
elif num_col_classifier == 2 :
img_w_new = 900
elif num_col_classifier == 3 :
img_w_new = 1500
elif num_col_classifier == 4 :
img_w_new = 1800
elif num_col_classifier == 5 :
img_w_new = 2200
elif num_col_classifier == 6 :
img_w_new = 2500
img_h_new = int ( img . shape [ 0 ] / float ( img . shape [ 1 ] ) * img_w_new )
img_resized = resize_image ( img , img_h_new , img_w_new )
if not self . dir_in :
model_region , session_region = self . start_new_session_and_model ( self . model_region_dir_p_ens_light_only_images_extraction )
prediction_regions_org = self . do_prediction_new_concept ( True , img_resized , model_region )
else :
prediction_regions_org = self . do_prediction_new_concept ( True , img_resized , self . model_region )
#plt.imshow(prediction_regions_org[:,:,0])
#plt.show()
prediction_regions_org = resize_image ( prediction_regions_org , img_height_h , img_width_h )
image_page , page_coord , cont_page = self . extract_page ( )
prediction_regions_org = prediction_regions_org [ page_coord [ 0 ] : page_coord [ 1 ] , page_coord [ 2 ] : page_coord [ 3 ] ]
prediction_regions_org = prediction_regions_org [ : , : , 0 ]
mask_lines_only = ( prediction_regions_org [ : , : ] == 3 ) * 1
mask_texts_only = ( prediction_regions_org [ : , : ] == 1 ) * 1
mask_images_only = ( prediction_regions_org [ : , : ] == 2 ) * 1
polygons_lines_xml , hir_lines_xml = return_contours_of_image ( mask_lines_only )
polygons_lines_xml = textline_con_fil = filter_contours_area_of_image ( mask_lines_only , polygons_lines_xml , hir_lines_xml , max_area = 1 , min_area = 0.00001 )
polygons_of_only_texts = return_contours_of_interested_region ( mask_texts_only , 1 , 0.00001 )
polygons_of_only_lines = return_contours_of_interested_region ( mask_lines_only , 1 , 0.00001 )
text_regions_p_true = np . zeros ( prediction_regions_org . shape )
text_regions_p_true = cv2 . fillPoly ( text_regions_p_true , pts = polygons_of_only_lines , color = ( 3 , 3 , 3 ) )
text_regions_p_true [ : , : ] [ mask_images_only [ : , : ] == 1 ] = 2
text_regions_p_true = cv2 . fillPoly ( text_regions_p_true , pts = polygons_of_only_texts , color = ( 1 , 1 , 1 ) )
text_regions_p_true [ text_regions_p_true . shape [ 0 ] - 15 : text_regions_p_true . shape [ 0 ] , : ] = 0
text_regions_p_true [ : , text_regions_p_true . shape [ 1 ] - 15 : text_regions_p_true . shape [ 1 ] ] = 0
##polygons_of_images = return_contours_of_interested_region(text_regions_p_true, 2, 0.0001)
polygons_of_images = return_contours_of_interested_region ( text_regions_p_true , 2 , 0.001 )
image_boundary_of_doc = np . zeros ( ( text_regions_p_true . shape [ 0 ] , text_regions_p_true . shape [ 1 ] ) )
###image_boundary_of_doc[:6, :] = 1
###image_boundary_of_doc[text_regions_p_true.shape[0]-6:text_regions_p_true.shape[0], :] = 1
###image_boundary_of_doc[:, :6] = 1
###image_boundary_of_doc[:, text_regions_p_true.shape[1]-6:text_regions_p_true.shape[1]] = 1
#plt.imshow(image_boundary_of_doc)
#plt.show()
polygons_of_images_fin = [ ]
for ploy_img_ind in polygons_of_images :
"""
test_poly_image = np . zeros ( ( text_regions_p_true . shape [ 0 ] , text_regions_p_true . shape [ 1 ] ) )
test_poly_image = cv2 . fillPoly ( test_poly_image , pts = [ ploy_img_ind ] , color = ( 1 , 1 , 1 ) )
test_poly_image = test_poly_image [ : , : ] + image_boundary_of_doc [ : , : ]
test_poly_image_intersected_area = ( test_poly_image [ : , : ] == 2 ) * 1
test_poly_image_intersected_area = test_poly_image_intersected_area . sum ( )
if test_poly_image_intersected_area == 0 :
##polygons_of_images_fin.append(ploy_img_ind)
x , y , w , h = cv2 . boundingRect ( ploy_img_ind )
box = [ x , y , w , h ]
_ , page_coord_img = crop_image_inside_box ( box , text_regions_p_true )
#cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
polygons_of_images_fin . append ( np . array ( [ [ page_coord_img [ 2 ] , page_coord_img [ 0 ] ] , [ page_coord_img [ 3 ] , page_coord_img [ 0 ] ] , [ page_coord_img [ 3 ] , page_coord_img [ 1 ] ] , [ page_coord_img [ 2 ] , page_coord_img [ 1 ] ] ] ) )
"""
x , y , w , h = cv2 . boundingRect ( ploy_img_ind )
if h < 150 or w < 150 :
pass
else :
box = [ x , y , w , h ]
_ , page_coord_img = crop_image_inside_box ( box , text_regions_p_true )
#cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
polygons_of_images_fin . append ( np . array ( [ [ page_coord_img [ 2 ] , page_coord_img [ 0 ] ] , [ page_coord_img [ 3 ] , page_coord_img [ 0 ] ] , [ page_coord_img [ 3 ] , page_coord_img [ 1 ] ] , [ page_coord_img [ 2 ] , page_coord_img [ 1 ] ] ] ) )
return text_regions_p_true , erosion_hurts , polygons_lines_xml , polygons_of_images_fin , image_page , page_coord , cont_page
def get_regions_light_v ( self , img , is_image_enhanced , num_col_classifier ) :
self . logger . debug ( " enter get_regions_light_v " )
erosion_hurts = False
@ -2425,6 +2590,7 @@ class Eynollah:
prediction_table_erode = cv2 . erode ( prediction_table [ : , : , 0 ] , KERNEL , iterations = 20 )
prediction_table_erode = cv2 . dilate ( prediction_table_erode , KERNEL , iterations = 20 )
return prediction_table_erode . astype ( np . int16 )
def run_graphics_and_columns_light ( self , text_regions_p_1 , textline_mask_tot_ea , num_col_classifier , num_column_is_classified , erosion_hurts ) :
img_g = self . imread ( grayscale = True , uint8 = True )
@ -2826,7 +2992,6 @@ class Eynollah:
"""
self . logger . debug ( " enter run " )
t0_tot = time . time ( )
if not self . dir_in :
@ -2837,276 +3002,295 @@ class Eynollah:
if self . dir_in :
self . reset_file_name_dir ( os . path . join ( self . dir_in , img_name ) )
img_res , is_image_enhanced , num_col_classifier , num_column_is_classified = self . run_enhancement ( self . light_version )
self . logger . info ( " Enhancing took %.1f s " , time . time ( ) - t0 )
t1 = time . time ( )
if self . light_version :
text_regions_p_1 , erosion_hurts , polygons_lines_xml , textline_mask_tot_ea = self . get_regions_light_v ( img_res , is_image_enhanced , num_col_classifier )
slope_deskew , slope_first = self . run_deskew ( textline_mask_tot_ea )
#self.logger.info("Textregion detection took %.1fs ", time.time() - t1t)
num_col , num_col_classifier , img_only_regions , page_coord , image_page , mask_images , mask_lines , text_regions_p_1 , cont_page , table_prediction , textline_mask_tot_ea = \
self . run_graphics_and_columns_light ( text_regions_p_1 , textline_mask_tot_ea , num_col_classifier , num_column_is_classified , erosion_hurts )
#self.logger.info("run graphics %.1fs ", time.time() - t1t)
textline_mask_tot_ea_org = np . copy ( textline_mask_tot_ea )
else :
text_regions_p_1 , erosion_hurts , polygons_lines_xml = self . get_regions_from_xy_2models ( img_res , is_image_enhanced , num_col_classifier )
self . logger . info ( " Textregion detection took %.1f s " , time . time ( ) - t1 )
t1 = time . time ( )
num_col , num_col_classifier , img_only_regions , page_coord , image_page , mask_images , mask_lines , text_regions_p_1 , cont_page , table_prediction = \
self . run_graphics_and_columns ( text_regions_p_1 , num_col_classifier , num_column_is_classified , erosion_hurts )
self . logger . info ( " Graphics detection took %.1f s " , time . time ( ) - t1 )
#self.logger.info('cont_page %s', cont_page)
if not num_col :
self . logger . info ( " No columns detected, outputting an empty PAGE-XML " )
pcgts = self . writer . build_pagexml_no_full_layout ( [ ] , page_coord , [ ] , [ ] , [ ] , [ ] , [ ] , [ ] , [ ] , [ ] , [ ] , [ ] , cont_page , [ ] , [ ] )
self . logger . info ( " Job done in %.1f s " , time . time ( ) - t1 )
if self . extract_only_images :
img_res , is_image_enhanced , num_col_classifier , num_column_is_classified = self . run_enhancement ( self . light_version )
self . logger . info ( " Enhancing took %.1f s " , time . time ( ) - t0 )
text_regions_p_1 , erosion_hurts , polygons_lines_xml , polygons_of_images , image_page , page_coord , cont_page = self . get_regions_light_v_extract_only_images ( img_res , is_image_enhanced , num_col_classifier )
pcgts = self . writer . build_pagexml_no_full_layout ( [ ] , page_coord , [ ] , [ ] , [ ] , [ ] , polygons_of_images , [ ] , [ ] , [ ] , [ ] , [ ] , cont_page , [ ] , [ ] )
if self . plotter :
self . plotter . write_images_into_directory ( polygons_of_images , image_page )
if self . dir_in :
self . writer . write_pagexml ( pcgts )
continue
else :
return pcgts
t1 = time . time ( )
if not self . light_version :
textline_mask_tot_ea = self . run_textline ( image_page )
self . logger . info ( " textline detection took %.1f s " , time . time ( ) - t1 )
else :
img_res , is_image_enhanced , num_col_classifier , num_column_is_classified = self . run_enhancement ( self . light_version )
self . logger . info ( " Enhancing took %.1f s " , time . time ( ) - t0 )
t1 = time . time ( )
slope_deskew , slope_first = self . run_deskew ( textline_mask_tot_ea )
self . logger . info ( " deskewing took %.1f s " , time . time ( ) - t1 )
t1 = time . time ( )
#plt.imshow(table_prediction)
#plt.show()
textline_mask_tot , text_regions_p , image_page_rotated = self . run_marginals ( image_page , textline_mask_tot_ea , mask_images , mask_lines , num_col_classifier , slope_deskew , text_regions_p_1 , table_prediction )
self . logger . info ( " detection of marginals took %.1f s " , time . time ( ) - t1 )
t1 = time . time ( )
if not self . full_layout :
polygons_of_images , img_revised_tab , text_regions_p_1_n , textline_mask_tot_d , regions_without_separators_d , boxes , boxes_d , polygons_of_marginals , contours_tables = self . run_boxes_no_full_layout ( image_page , textline_mask_tot , text_regions_p , slope_deskew , num_col_classifier , table_prediction , erosion_hurts )
if self . full_layout :
polygons_of_images , img_revised_tab , text_regions_p_1_n , textline_mask_tot_d , regions_without_separators_d , regions_fully , regions_without_separators , polygons_of_marginals , contours_tables = self . run_boxes_full_layout ( image_page , textline_mask_tot , text_regions_p , slope_deskew , num_col_classifier , img_only_regions , table_prediction , erosion_hurts )
text_only = ( ( img_revised_tab [ : , : ] == 1 ) ) * 1
if np . abs ( slope_deskew ) > = SLOPE_THRESHOLD :
text_only_d = ( ( text_regions_p_1_n [ : , : ] == 1 ) ) * 1
min_con_area = 0.000005
if np . abs ( slope_deskew ) > = SLOPE_THRESHOLD :
contours_only_text , hir_on_text = return_contours_of_image ( text_only )
contours_only_text_parent = return_parent_contours ( contours_only_text , hir_on_text )
if len ( contours_only_text_parent ) > 0 :
areas_cnt_text = np . array ( [ cv2 . contourArea ( c ) for c in contours_only_text_parent ] )
areas_cnt_text = areas_cnt_text / float ( text_only . shape [ 0 ] * text_only . shape [ 1 ] )
#self.logger.info('areas_cnt_text %s', areas_cnt_text)
contours_biggest = contours_only_text_parent [ np . argmax ( areas_cnt_text ) ]
contours_only_text_parent = [ c for jz , c in enumerate ( contours_only_text_parent ) if areas_cnt_text [ jz ] > min_con_area ]
areas_cnt_text_parent = [ area for area in areas_cnt_text if area > min_con_area ]
index_con_parents = np . argsort ( areas_cnt_text_parent )
contours_only_text_parent = list ( np . array ( contours_only_text_parent , dtype = object ) [ index_con_parents ] )
areas_cnt_text_parent = list ( np . array ( areas_cnt_text_parent ) [ index_con_parents ] )
cx_bigest_big , cy_biggest_big , _ , _ , _ , _ , _ = find_new_features_of_contours ( [ contours_biggest ] )
cx_bigest , cy_biggest , _ , _ , _ , _ , _ = find_new_features_of_contours ( contours_only_text_parent )
contours_only_text_d , hir_on_text_d = return_contours_of_image ( text_only_d )
contours_only_text_parent_d = return_parent_contours ( contours_only_text_d , hir_on_text_d )
areas_cnt_text_d = np . array ( [ cv2 . contourArea ( c ) for c in contours_only_text_parent_d ] )
areas_cnt_text_d = areas_cnt_text_d / float ( text_only_d . shape [ 0 ] * text_only_d . shape [ 1 ] )
if len ( areas_cnt_text_d ) > 0 :
contours_biggest_d = contours_only_text_parent_d [ np . argmax ( areas_cnt_text_d ) ]
index_con_parents_d = np . argsort ( areas_cnt_text_d )
contours_only_text_parent_d = list ( np . array ( contours_only_text_parent_d , dtype = object ) [ index_con_parents_d ] )
areas_cnt_text_d = list ( np . array ( areas_cnt_text_d ) [ index_con_parents_d ] )
cx_bigest_d_big , cy_biggest_d_big , _ , _ , _ , _ , _ = find_new_features_of_contours ( [ contours_biggest_d ] )
cx_bigest_d , cy_biggest_d , _ , _ , _ , _ , _ = find_new_features_of_contours ( contours_only_text_parent_d )
try :
if len ( cx_bigest_d ) > = 5 :
cx_bigest_d_last5 = cx_bigest_d [ - 5 : ]
cy_biggest_d_last5 = cy_biggest_d [ - 5 : ]
dists_d = [ math . sqrt ( ( cx_bigest_big [ 0 ] - cx_bigest_d_last5 [ j ] ) * * 2 + ( cy_biggest_big [ 0 ] - cy_biggest_d_last5 [ j ] ) * * 2 ) for j in range ( len ( cy_biggest_d_last5 ) ) ]
ind_largest = len ( cx_bigest_d ) - 5 + np . argmin ( dists_d )
else :
cx_bigest_d_last5 = cx_bigest_d [ - len ( cx_bigest_d ) : ]
cy_biggest_d_last5 = cy_biggest_d [ - len ( cx_bigest_d ) : ]
dists_d = [ math . sqrt ( ( cx_bigest_big [ 0 ] - cx_bigest_d_last5 [ j ] ) * * 2 + ( cy_biggest_big [ 0 ] - cy_biggest_d_last5 [ j ] ) * * 2 ) for j in range ( len ( cy_biggest_d_last5 ) ) ]
ind_largest = len ( cx_bigest_d ) - len ( cx_bigest_d ) + np . argmin ( dists_d )
cx_bigest_d_big [ 0 ] = cx_bigest_d [ ind_largest ]
cy_biggest_d_big [ 0 ] = cy_biggest_d [ ind_largest ]
except Exception as why :
self . logger . error ( why )
( h , w ) = text_only . shape [ : 2 ]
center = ( w / / 2.0 , h / / 2.0 )
M = cv2 . getRotationMatrix2D ( center , slope_deskew , 1.0 )
M_22 = np . array ( M ) [ : 2 , : 2 ]
p_big = np . dot ( M_22 , [ cx_bigest_big , cy_biggest_big ] )
x_diff = p_big [ 0 ] - cx_bigest_d_big
y_diff = p_big [ 1 ] - cy_biggest_d_big
if self . light_version :
text_regions_p_1 , erosion_hurts , polygons_lines_xml , textline_mask_tot_ea = self . get_regions_light_v ( img_res , is_image_enhanced , num_col_classifier )
slope_deskew , slope_first = self . run_deskew ( textline_mask_tot_ea )
#self.logger.info("Textregion detection took %.1fs ", time.time() - t1t)
num_col , num_col_classifier , img_only_regions , page_coord , image_page , mask_images , mask_lines , text_regions_p_1 , cont_page , table_prediction , textline_mask_tot_ea = \
self . run_graphics_and_columns_light ( text_regions_p_1 , textline_mask_tot_ea , num_col_classifier , num_column_is_classified , erosion_hurts )
#self.logger.info("run graphics %.1fs ", time.time() - t1t)
textline_mask_tot_ea_org = np . copy ( textline_mask_tot_ea )
else :
text_regions_p_1 , erosion_hurts , polygons_lines_xml = self . get_regions_from_xy_2models ( img_res , is_image_enhanced , num_col_classifier )
self . logger . info ( " Textregion detection took %.1f s " , time . time ( ) - t1 )
t1 = time . time ( )
num_col , num_col_classifier , img_only_regions , page_coord , image_page , mask_images , mask_lines , text_regions_p_1 , cont_page , table_prediction = \
self . run_graphics_and_columns ( text_regions_p_1 , num_col_classifier , num_column_is_classified , erosion_hurts )
self . logger . info ( " Graphics detection took %.1f s " , time . time ( ) - t1 )
#self.logger.info('cont_page %s', cont_page)
if not num_col :
self . logger . info ( " No columns detected, outputting an empty PAGE-XML " )
pcgts = self . writer . build_pagexml_no_full_layout ( [ ] , page_coord , [ ] , [ ] , [ ] , [ ] , [ ] , [ ] , [ ] , [ ] , [ ] , [ ] , cont_page , [ ] , [ ] )
self . logger . info ( " Job done in %.1f s " , time . time ( ) - t1 )
if self . dir_in :
self . writer . write_pagexml ( pcgts )
continue
else :
return pcgts
t1 = time . time ( )
if not self . light_version :
textline_mask_tot_ea = self . run_textline ( image_page )
self . logger . info ( " textline detection took %.1f s " , time . time ( ) - t1 )
t1 = time . time ( )
slope_deskew , slope_first = self . run_deskew ( textline_mask_tot_ea )
self . logger . info ( " deskewing took %.1f s " , time . time ( ) - t1 )
t1 = time . time ( )
#plt.imshow(table_prediction)
#plt.show()
textline_mask_tot , text_regions_p , image_page_rotated = self . run_marginals ( image_page , textline_mask_tot_ea , mask_images , mask_lines , num_col_classifier , slope_deskew , text_regions_p_1 , table_prediction )
self . logger . info ( " detection of marginals took %.1f s " , time . time ( ) - t1 )
t1 = time . time ( )
if not self . full_layout :
polygons_of_images , img_revised_tab , text_regions_p_1_n , textline_mask_tot_d , regions_without_separators_d , boxes , boxes_d , polygons_of_marginals , contours_tables = self . run_boxes_no_full_layout ( image_page , textline_mask_tot , text_regions_p , slope_deskew , num_col_classifier , table_prediction , erosion_hurts )
if self . full_layout :
polygons_of_images , img_revised_tab , text_regions_p_1_n , textline_mask_tot_d , regions_without_separators_d , regions_fully , regions_without_separators , polygons_of_marginals , contours_tables = self . run_boxes_full_layout ( image_page , textline_mask_tot , text_regions_p , slope_deskew , num_col_classifier , img_only_regions , table_prediction , erosion_hurts )
text_only = ( ( img_revised_tab [ : , : ] == 1 ) ) * 1
if np . abs ( slope_deskew ) > = SLOPE_THRESHOLD :
text_only_d = ( ( text_regions_p_1_n [ : , : ] == 1 ) ) * 1
min_con_area = 0.000005
if np . abs ( slope_deskew ) > = SLOPE_THRESHOLD :
contours_only_text , hir_on_text = return_contours_of_image ( text_only )
contours_only_text_parent = return_parent_contours ( contours_only_text , hir_on_text )
if len ( contours_only_text_parent ) > 0 :
areas_cnt_text = np . array ( [ cv2 . contourArea ( c ) for c in contours_only_text_parent ] )
areas_cnt_text = areas_cnt_text / float ( text_only . shape [ 0 ] * text_only . shape [ 1 ] )
#self.logger.info('areas_cnt_text %s', areas_cnt_text)
contours_biggest = contours_only_text_parent [ np . argmax ( areas_cnt_text ) ]
contours_only_text_parent = [ c for jz , c in enumerate ( contours_only_text_parent ) if areas_cnt_text [ jz ] > min_con_area ]
areas_cnt_text_parent = [ area for area in areas_cnt_text if area > min_con_area ]
index_con_parents = np . argsort ( areas_cnt_text_parent )
contours_only_text_parent = list ( np . array ( contours_only_text_parent , dtype = object ) [ index_con_parents ] )
areas_cnt_text_parent = list ( np . array ( areas_cnt_text_parent ) [ index_con_parents ] )
cx_bigest_big , cy_biggest_big , _ , _ , _ , _ , _ = find_new_features_of_contours ( [ contours_biggest ] )
cx_bigest , cy_biggest , _ , _ , _ , _ , _ = find_new_features_of_contours ( contours_only_text_parent )
contours_only_text_d , hir_on_text_d = return_contours_of_image ( text_only_d )
contours_only_text_parent_d = return_parent_contours ( contours_only_text_d , hir_on_text_d )
areas_cnt_text_d = np . array ( [ cv2 . contourArea ( c ) for c in contours_only_text_parent_d ] )
areas_cnt_text_d = areas_cnt_text_d / float ( text_only_d . shape [ 0 ] * text_only_d . shape [ 1 ] )
if len ( areas_cnt_text_d ) > 0 :
contours_biggest_d = contours_only_text_parent_d [ np . argmax ( areas_cnt_text_d ) ]
index_con_parents_d = np . argsort ( areas_cnt_text_d )
contours_only_text_parent_d = list ( np . array ( contours_only_text_parent_d , dtype = object ) [ index_con_parents_d ] )
areas_cnt_text_d = list ( np . array ( areas_cnt_text_d ) [ index_con_parents_d ] )
cx_bigest_d_big , cy_biggest_d_big , _ , _ , _ , _ , _ = find_new_features_of_contours ( [ contours_biggest_d ] )
cx_bigest_d , cy_biggest_d , _ , _ , _ , _ , _ = find_new_features_of_contours ( contours_only_text_parent_d )
try :
if len ( cx_bigest_d ) > = 5 :
cx_bigest_d_last5 = cx_bigest_d [ - 5 : ]
cy_biggest_d_last5 = cy_biggest_d [ - 5 : ]
dists_d = [ math . sqrt ( ( cx_bigest_big [ 0 ] - cx_bigest_d_last5 [ j ] ) * * 2 + ( cy_biggest_big [ 0 ] - cy_biggest_d_last5 [ j ] ) * * 2 ) for j in range ( len ( cy_biggest_d_last5 ) ) ]
ind_largest = len ( cx_bigest_d ) - 5 + np . argmin ( dists_d )
else :
cx_bigest_d_last5 = cx_bigest_d [ - len ( cx_bigest_d ) : ]
cy_biggest_d_last5 = cy_biggest_d [ - len ( cx_bigest_d ) : ]
dists_d = [ math . sqrt ( ( cx_bigest_big [ 0 ] - cx_bigest_d_last5 [ j ] ) * * 2 + ( cy_biggest_big [ 0 ] - cy_biggest_d_last5 [ j ] ) * * 2 ) for j in range ( len ( cy_biggest_d_last5 ) ) ]
ind_largest = len ( cx_bigest_d ) - len ( cx_bigest_d ) + np . argmin ( dists_d )
cx_bigest_d_big [ 0 ] = cx_bigest_d [ ind_largest ]
cy_biggest_d_big [ 0 ] = cy_biggest_d [ ind_largest ]
except Exception as why :
self . logger . error ( why )
( h , w ) = text_only . shape [ : 2 ]
center = ( w / / 2.0 , h / / 2.0 )
M = cv2 . getRotationMatrix2D ( center , slope_deskew , 1.0 )
M_22 = np . array ( M ) [ : 2 , : 2 ]
p_big = np . dot ( M_22 , [ cx_bigest_big , cy_biggest_big ] )
x_diff = p_big [ 0 ] - cx_bigest_d_big
y_diff = p_big [ 1 ] - cy_biggest_d_big
contours_only_text_parent_d_ordered = [ ]
for i in range ( len ( contours_only_text_parent ) ) :
p = np . dot ( M_22 , [ cx_bigest [ i ] , cy_biggest [ i ] ] )
p [ 0 ] = p [ 0 ] - x_diff [ 0 ]
p [ 1 ] = p [ 1 ] - y_diff [ 0 ]
dists = [ math . sqrt ( ( p [ 0 ] - cx_bigest_d [ j ] ) * * 2 + ( p [ 1 ] - cy_biggest_d [ j ] ) * * 2 ) for j in range ( len ( cx_bigest_d ) ) ]
contours_only_text_parent_d_ordered . append ( contours_only_text_parent_d [ np . argmin ( dists ) ] )
# img2=np.zeros((text_only.shape[0],text_only.shape[1],3))
# img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1))
# plt.imshow(img2[:,:,0])
# plt.show()
else :
contours_only_text_parent_d_ordered = [ ]
contours_only_text_parent_d = [ ]
contours_only_text_parent = [ ]
contours_only_text_parent_d_ordered = [ ]
for i in range ( len ( contours_only_text_parent ) ) :
p = np . dot ( M_22 , [ cx_bigest [ i ] , cy_biggest [ i ] ] )
p [ 0 ] = p [ 0 ] - x_diff [ 0 ]
p [ 1 ] = p [ 1 ] - y_diff [ 0 ]
dists = [ math . sqrt ( ( p [ 0 ] - cx_bigest_d [ j ] ) * * 2 + ( p [ 1 ] - cy_biggest_d [ j ] ) * * 2 ) for j in range ( len ( cx_bigest_d ) ) ]
contours_only_text_parent_d_ordered . append ( contours_only_text_parent_d [ np . argmin ( dists ) ] )
# img2=np.zeros((text_only.shape[0],text_only.shape[1],3))
# img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1))
# plt.imshow(img2[:,:,0])
# plt.show()
else :
contours_only_text_parent_d_ordered = [ ]
contours_only_text_parent_d = [ ]
contours_only_text_parent = [ ]
else :
contours_only_text_parent_d_ordered = [ ]
contours_only_text_parent_d = [ ]
contours_only_text_parent = [ ]
else :
contours_only_text , hir_on_text = return_contours_of_image ( text_only )
contours_only_text_parent = return_parent_contours ( contours_only_text , hir_on_text )
if len ( contours_only_text_parent ) > 0 :
areas_cnt_text = np . array ( [ cv2 . contourArea ( c ) for c in contours_only_text_parent ] )
areas_cnt_text = areas_cnt_text / float ( text_only . shape [ 0 ] * text_only . shape [ 1 ] )
contours_biggest = contours_only_text_parent [ np . argmax ( areas_cnt_text ) ]
contours_only_text_parent = [ c for jz , c in enumerate ( contours_only_text_parent ) if areas_cnt_text [ jz ] > min_con_area ]
areas_cnt_text_parent = [ area for area in areas_cnt_text if area > min_con_area ]
index_con_parents = np . argsort ( areas_cnt_text_parent )
contours_only_text_parent = list ( np . array ( contours_only_text_parent , dtype = object ) [ index_con_parents ] )
areas_cnt_text_parent = list ( np . array ( areas_cnt_text_parent ) [ index_con_parents ] )
cx_bigest_big , cy_biggest_big , _ , _ , _ , _ , _ = find_new_features_of_contours ( [ contours_biggest ] )
cx_bigest , cy_biggest , _ , _ , _ , _ , _ = find_new_features_of_contours ( contours_only_text_parent )
#self.logger.debug('areas_cnt_text_parent %s', areas_cnt_text_parent)
# self.logger.debug('areas_cnt_text_parent_d %s', areas_cnt_text_parent_d)
# self.logger.debug('len(contours_only_text_parent) %s', len(contours_only_text_parent_d))
else :
pass
if self . light_version :
txt_con_org = get_textregion_contours_in_org_image_light ( contours_only_text_parent , self . image , slope_first )
else :
txt_con_org = get_textregion_contours_in_org_image ( contours_only_text_parent , self . image , slope_first )
boxes_text , _ = get_text_region_boxes_by_given_contours ( contours_only_text_parent )
boxes_marginals , _ = get_text_region_boxes_by_given_contours ( polygons_of_marginals )
if not self . curved_line :
if self . light_version :
if self . textline_light :
slopes , all_found_textline_polygons , boxes_text , txt_con_org , contours_only_text_parent , all_box_coord , index_by_text_par_con = self . get_slopes_and_deskew_new_light ( txt_con_org , contours_only_text_parent , textline_mask_tot_ea_org , image_page_rotated , boxes_text , slope_deskew )
slopes_marginals , all_found_textline_polygons_marginals , boxes_marginals , _ , polygons_of_marginals , all_box_coord_marginals , _ = self . get_slopes_and_deskew_new_light ( polygons_of_marginals , polygons_of_marginals , textline_mask_tot_ea_org , image_page_rotated , boxes_marginals , slope_deskew )
contours_only_text , hir_on_text = return_contours_of_image ( text_only )
contours_only_text_parent = return_parent_contours ( contours_only_text , hir_on_text )
if len ( contours_only_text_parent ) > 0 :
areas_cnt_text = np . array ( [ cv2 . contourArea ( c ) for c in contours_only_text_parent ] )
areas_cnt_text = areas_cnt_text / float ( text_only . shape [ 0 ] * text_only . shape [ 1 ] )
contours_biggest = contours_only_text_parent [ np . argmax ( areas_cnt_text ) ]
contours_only_text_parent = [ c for jz , c in enumerate ( contours_only_text_parent ) if areas_cnt_text [ jz ] > min_con_area ]
areas_cnt_text_parent = [ area for area in areas_cnt_text if area > min_con_area ]
index_con_parents = np . argsort ( areas_cnt_text_parent )
contours_only_text_parent = list ( np . array ( contours_only_text_parent , dtype = object ) [ index_con_parents ] )
areas_cnt_text_parent = list ( np . array ( areas_cnt_text_parent ) [ index_con_parents ] )
cx_bigest_big , cy_biggest_big , _ , _ , _ , _ , _ = find_new_features_of_contours ( [ contours_biggest ] )
cx_bigest , cy_biggest , _ , _ , _ , _ , _ = find_new_features_of_contours ( contours_only_text_parent )
#self.logger.debug('areas_cnt_text_parent %s', areas_cnt_text_parent)
# self.logger.debug('areas_cnt_text_parent_d %s', areas_cnt_text_parent_d)
# self.logger.debug('len(contours_only_text_parent) %s', len(contours_only_text_parent_d))
else :
slopes , all_found_textline_polygons , boxes_text , txt_con_org , contours_only_text_parent , all_box_coord , index_by_text_par_con = self . get_slopes_and_deskew_new_light ( txt_con_org , contours_only_text_parent , textline_mask_tot_ea , image_page_rotated , boxes_text , slope_deskew )
slopes_marginals , all_found_textline_polygons_marginals , boxes_marginals , _ , polygons_of_marginals , all_box_coord_marginals , _ = self . get_slopes_and_deskew_new_light ( polygons_of_marginals , polygons_of_marginals , textline_mask_tot_ea , image_page_rotated , boxes_marginals , slope_deskew )
pass
if self . light_version :
txt_con_org = get_textregion_contours_in_org_image_light ( contours_only_text_parent , self . image , slope_first )
else :
slopes , all_found_textline_polygons , boxes_text , txt_con_org , contours_only_text_parent , all_box_coord , index_by_text_par_con = self . get_slopes_and_deskew_new ( txt_con_org , contours_only_text_parent , textline_mask_tot_ea , image_page_rotated , boxes_text , slope_deskew )
slopes_marginals , all_found_textline_polygons_marginals , boxes_marginals , _ , polygons_of_marginals , all_box_coord_marginals , _ = self . get_slopes_and_deskew_new ( polygons_of_marginals , polygons_of_marginals , textline_mask_tot_ea , image_page_rotated , boxes_marginals , slope_deskew )
txt_con_org = get_textregion_contours_in_org_image ( contours_only_text_parent , self . image , slope_first )
boxes_text , _ = get_text_region_boxes_by_given_contours ( contours_only_text_parent )
boxes_marginals , _ = get_text_region_boxes_by_given_contours ( polygons_of_marginals )
else :
scale_param = 1
all_found_textline_polygons , boxes_text , txt_con_org , contours_only_text_parent , all_box_coord , index_by_text_par_con , slopes = self . get_slopes_and_deskew_new_curved ( txt_con_org , contours_only_text_parent , cv2 . erode ( textline_mask_tot_ea , kernel = KERNEL , iterations = 1 ) , image_page_rotated , boxes_text , text_only , num_col_classifier , scale_param , slope_deskew )
all_found_textline_polygons = small_textlines_to_parent_adherence2 ( all_found_textline_polygons , textline_mask_tot_ea , num_col_classifier )
all_found_textline_polygons_marginals , boxes_marginals , _ , polygons_of_marginals , all_box_coord_marginals , _ , slopes_marginals = self . get_slopes_and_deskew_new_curved ( polygons_of_marginals , polygons_of_marginals , cv2 . erode ( textline_mask_tot_ea , kernel = KERNEL , iterations = 1 ) , image_page_rotated , boxes_marginals , text_only , num_col_classifier , scale_param , slope_deskew )
all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2 ( all_found_textline_polygons_marginals , textline_mask_tot_ea , num_col_classifier )
if self . full_layout :
if np . abs ( slope_deskew ) > = SLOPE_THRESHOLD :
contours_only_text_parent_d_ordered = list ( np . array ( contours_only_text_parent_d_ordered , dtype = object ) [ index_by_text_par_con ] )
if not self . curved_line :
if self . light_version :
text_regions_p , contours_only_text_parent , contours_only_text_parent_h , all_box_coord , all_box_coord_h , all_found_textline_polygons , all_found_textline_polygons_h , slopes , slopes_h , contours_only_text_parent_d_ordered , contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light ( text_regions_p , regions_fully , contours_only_text_parent , all_box_coord , all_found_textline_polygons , slopes , contours_only_text_parent_d_ordered )
if self . textline_light :
slopes , all_found_textline_polygons , boxes_text , txt_con_org , contours_only_text_parent , all_box_coord , index_by_text_par_con = self . get_slopes_and_deskew_new_light ( txt_con_org , contours_only_text_parent , textline_mask_tot_ea_org , image_page_rotated , boxes_text , slope_deskew )
slopes_marginals , all_found_textline_polygons_marginals , boxes_marginals , _ , polygons_of_marginals , all_box_coord_marginals , _ = self . get_slopes_and_deskew_new_light ( polygons_of_marginals , polygons_of_marginals , textline_mask_tot_ea_org , image_page_rotated , boxes_marginals , slope_deskew )
else :
slopes , all_found_textline_polygons , boxes_text , txt_con_org , contours_only_text_parent , all_box_coord , index_by_text_par_con = self . get_slopes_and_deskew_new_light ( txt_con_org , contours_only_text_parent , textline_mask_tot_ea , image_page_rotated , boxes_text , slope_deskew )
slopes_marginals , all_found_textline_polygons_marginals , boxes_marginals , _ , polygons_of_marginals , all_box_coord_marginals , _ = self . get_slopes_and_deskew_new_light ( polygons_of_marginals , polygons_of_marginals , textline_mask_tot_ea , image_page_rotated , boxes_marginals , slope_deskew )
else :
text_regions_p , contours_only_text_parent , contours_only_text_parent_h , all_box_coord , all_box_coord_h , all_found_textline_polygons , all_found_textline_polygons_h , slopes , slopes_h , contours_only_text_parent_d_ordered , contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header ( text_regions_p , regions_fully , contours_only_text_parent , all_box_coord , all_found_textline_polygons , slopes , contours_only_text_parent_d_ordered )
slopes , all_found_textline_polygons , boxes_text , txt_con_org , contours_only_text_parent , all_box_coord , index_by_text_par_con = self . get_slopes_and_deskew_new ( txt_con_org , contours_only_text_parent , textline_mask_tot_ea , image_page_rotated , boxes_text , slope_deskew )
slopes_marginals , all_found_textline_polygons_marginals , boxes_marginals , _ , polygons_of_marginals , all_box_coord_marginals , _ = self . get_slopes_and_deskew_new ( polygons_of_marginals , polygons_of_marginals , textline_mask_tot_ea , image_page_rotated , boxes_marginals , slope_deskew )
else :
#takes long timee
contours_only_text_parent_d_ordered = None
if self . light_version :
text_regions_p , contours_only_text_parent , contours_only_text_parent_h , all_box_coord , all_box_coord_h , all_found_textline_polygons , all_found_textline_polygons_h , slopes , slopes_h , contours_only_text_parent_d_ordered , contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light ( text_regions_p , regions_fully , contours_only_text_parent , all_box_coord , all_found_textline_polygons , slopes , contours_only_text_parent_d_ordered )
scale_param = 1
all_found_textline_polygons , boxes_text , txt_con_org , contours_only_text_parent , all_box_coord , index_by_text_par_con , slopes = self . get_slopes_and_deskew_new_curved ( txt_con_org , contours_only_text_parent , cv2 . erode ( textline_mask_tot_ea , kernel = KERNEL , iterations = 1 ) , image_page_rotated , boxes_text , text_only , num_col_classifier , scale_param , slope_deskew )
all_found_textline_polygons = small_textlines_to_parent_adherence2 ( all_found_textline_polygons , textline_mask_tot_ea , num_col_classifier )
all_found_textline_polygons_marginals , boxes_marginals , _ , polygons_of_marginals , all_box_coord_marginals , _ , slopes_marginals = self . get_slopes_and_deskew_new_curved ( polygons_of_marginals , polygons_of_marginals , cv2 . erode ( textline_mask_tot_ea , kernel = KERNEL , iterations = 1 ) , image_page_rotated , boxes_marginals , text_only , num_col_classifier , scale_param , slope_deskew )
all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2 ( all_found_textline_polygons_marginals , textline_mask_tot_ea , num_col_classifier )
if self . full_layout :
if np . abs ( slope_deskew ) > = SLOPE_THRESHOLD :
contours_only_text_parent_d_ordered = list ( np . array ( contours_only_text_parent_d_ordered , dtype = object ) [ index_by_text_par_con ] )
if self . light_version :
text_regions_p , contours_only_text_parent , contours_only_text_parent_h , all_box_coord , all_box_coord_h , all_found_textline_polygons , all_found_textline_polygons_h , slopes , slopes_h , contours_only_text_parent_d_ordered , contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light ( text_regions_p , regions_fully , contours_only_text_parent , all_box_coord , all_found_textline_polygons , slopes , contours_only_text_parent_d_ordered )
else :
text_regions_p , contours_only_text_parent , contours_only_text_parent_h , all_box_coord , all_box_coord_h , all_found_textline_polygons , all_found_textline_polygons_h , slopes , slopes_h , contours_only_text_parent_d_ordered , contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header ( text_regions_p , regions_fully , contours_only_text_parent , all_box_coord , all_found_textline_polygons , slopes , contours_only_text_parent_d_ordered )
else :
text_regions_p , contours_only_text_parent , contours_only_text_parent_h , all_box_coord , all_box_coord_h , all_found_textline_polygons , all_found_textline_polygons_h , slopes , slopes_h , contours_only_text_parent_d_ordered , contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header ( text_regions_p , regions_fully , contours_only_text_parent , all_box_coord , all_found_textline_polygons , slopes , contours_only_text_parent_d_ordered )
#takes long timee
contours_only_text_parent_d_ordered = None
if self . light_version :
text_regions_p , contours_only_text_parent , contours_only_text_parent_h , all_box_coord , all_box_coord_h , all_found_textline_polygons , all_found_textline_polygons_h , slopes , slopes_h , contours_only_text_parent_d_ordered , contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light ( text_regions_p , regions_fully , contours_only_text_parent , all_box_coord , all_found_textline_polygons , slopes , contours_only_text_parent_d_ordered )
else :
text_regions_p , contours_only_text_parent , contours_only_text_parent_h , all_box_coord , all_box_coord_h , all_found_textline_polygons , all_found_textline_polygons_h , slopes , slopes_h , contours_only_text_parent_d_ordered , contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header ( text_regions_p , regions_fully , contours_only_text_parent , all_box_coord , all_found_textline_polygons , slopes , contours_only_text_parent_d_ordered )
if self . plotter :
self . plotter . save_plot_of_layout ( text_regions_p , image_page )
self . plotter . save_plot_of_layout_all ( text_regions_p , image_page )
pixel_img = 4
polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size ( text_regions_p , pixel_img )
all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline ( text_regions_p , polygons_of_drop_capitals , contours_only_text_parent , contours_only_text_parent_h , all_box_coord , all_box_coord_h , all_found_textline_polygons , all_found_textline_polygons_h , kernel = KERNEL , curved_line = self . curved_line )
pixel_lines = 6
if not self . headers_off :
if np . abs ( slope_deskew ) < SLOPE_THRESHOLD :
num_col , _ , matrix_of_lines_ch , splitter_y_new , _ = find_number_of_columns_in_document ( np . repeat ( text_regions_p [ : , : , np . newaxis ] , 3 , axis = 2 ) , num_col_classifier , self . tables , pixel_lines , contours_only_text_parent_h )
else :
_ , _ , matrix_of_lines_ch_d , splitter_y_new_d , _ = find_number_of_columns_in_document ( np . repeat ( text_regions_p_1_n [ : , : , np . newaxis ] , 3 , axis = 2 ) , num_col_classifier , self . tables , pixel_lines , contours_only_text_parent_h_d_ordered )
elif self . headers_off :
if np . abs ( slope_deskew ) < SLOPE_THRESHOLD :
num_col , _ , matrix_of_lines_ch , splitter_y_new , _ = find_number_of_columns_in_document ( np . repeat ( text_regions_p [ : , : , np . newaxis ] , 3 , axis = 2 ) , num_col_classifier , self . tables , pixel_lines )
else :
_ , _ , matrix_of_lines_ch_d , splitter_y_new_d , _ = find_number_of_columns_in_document ( np . repeat ( text_regions_p_1_n [ : , : , np . newaxis ] , 3 , axis = 2 ) , num_col_classifier , self . tables , pixel_lines )
if num_col_classifier > = 3 :
if np . abs ( slope_deskew ) < SLOPE_THRESHOLD :
regions_without_separators = regions_without_separators . astype ( np . uint8 )
regions_without_separators = cv2 . erode ( regions_without_separators [ : , : ] , KERNEL , iterations = 6 )
else :
regions_without_separators_d = regions_without_separators_d . astype ( np . uint8 )
regions_without_separators_d = cv2 . erode ( regions_without_separators_d [ : , : ] , KERNEL , iterations = 6 )
if self . plotter :
self . plotter . save_plot_of_layout ( text_regions_p , image_page )
self . plotter . save_plot_of_layout_all ( text_regions_p , image_page )
pixel_img = 4
polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size ( text_regions_p , pixel_img )
all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline ( text_regions_p , polygons_of_drop_capitals , contours_only_text_parent , contours_only_text_parent_h , all_box_coord , all_box_coord_h , all_found_textline_polygons , all_found_textline_polygons_h , kernel = KERNEL , curved_line = self . curved_line )
pixel_lines = 6
if not self . headers_off :
if np . abs ( slope_deskew ) < SLOPE_THRESHOLD :
num_col , _ , matrix_of_lines_ch , splitter_y_new , _ = find_number_of_columns_in_document ( np . repeat ( text_regions_p [ : , : , np . newaxis ] , 3 , axis = 2 ) , num_col_classifier , self . tables , pixel_lines , contours_only_text_parent_h )
boxes , peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new ( splitter_y_new , regions_without_separators , matrix_of_lines_ch , num_col_classifier , erosion_hurts , self . tables , self . right2left )
else :
_ , _ , matrix_of_lines_ch_d , splitter_y_new_d , _ = find_number_of_columns_in_document ( np . repeat ( text_regions_p_1_n [ : , : , np . newaxis ] , 3 , axis = 2 ) , num_col_classifier , self . tables , pixel_lines , contours_only_text_parent_h_d_ordered )
elif self . headers_off :
boxes_d , peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new ( splitter_y_new_d , regions_without_separators_d , matrix_of_lines_ch_d , num_col_classifier , erosion_hurts , self . tables , self . right2left )
#print(boxes_d,'boxes_d')
#img_once = np.zeros((textline_mask_tot_d.shape[0],textline_mask_tot_d.shape[1]))
#for box_i in boxes_d:
#img_once[int(box_i[2]):int(box_i[3]),int(box_i[0]):int(box_i[1]) ] =1
#plt.imshow(img_once)
#plt.show()
#print(np.unique(img_once),'img_once')
if self . plotter :
self . plotter . write_images_into_directory ( polygons_of_images , image_page )
t_order = time . time ( )
if self . full_layout :
if np . abs ( slope_deskew ) < SLOPE_THRESHOLD :
num_col , _ , matrix_of_lines_ch , splitter_y_new , _ = find_number_of_columns_in_document ( np . repeat ( text_regions_p [ : , : , np . newaxis ] , 3 , axis = 2 ) , num_col_classifier , self . tables , pixel_lines )
order_text_new, id_of_texts_tot = self . do_order_of_regions ( contours_only_text_parent , contours_only_text_parent_h , boxes , textline_mask_tot )
else :
_ , _ , matrix_of_lines_ch_d , splitter_y_new_d , _ = find_number_of_columns_in_document ( np . repeat ( text_regions_p_1_n [ : , : , np . newaxis ] , 3 , axis = 2 ) , num_col_classifier , self . tables , pixel_lines )
if num_col_classifier > = 3 :
if np . abs ( slope_deskew ) < SLOPE_THRESHOLD :
regions_without_separators = regions_without_separators . astype ( np . uint8 )
regions_without_separators = cv2 . erode ( regions_without_separators [ : , : ] , KERNEL , iterations = 6 )
order_text_new , id_of_texts_tot = self . do_order_of_regions ( contours_only_text_parent_d_ordered , contours_only_text_parent_h_d_ordered , boxes_d , textline_mask_tot_d )
else :
regions_without_separators_d = regions_without_separators_d . astype ( np . uint8 )
regions_without_separators_d = cv2 . erode ( regions_without_separators_d [ : , : ] , KERNEL , iterations = 6 )
pcgts = self . writer . build_pagexml_full_layout ( contours_only_text_parent , contours_only_text_parent_h , page_coord , order_text_new , id_of_texts_tot , all_found_textline_polygons , all_found_textline_polygons_h , all_box_coord , all_box_coord_h , polygons_of_images , contours_tables , polygons_of_drop_capitals , polygons_of_marginals , all_found_textline_polygons_marginals , all_box_coord_marginals , slopes , slopes_h , slopes_marginals , cont_page , polygons_lines_xml )
self . logger . info ( " Job done in %.1f s " , time . time ( ) - t0 )
if np . abs ( slope_deskew ) < SLOPE_THRESHOLD :
boxes , peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new ( splitter_y_new , regions_without_separators , matrix_of_lines_ch , num_col_classifier , erosion_hurts , self . tables , self . right2left )
else :
boxes_d , peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new ( splitter_y_new_d , regions_without_separators_d , matrix_of_lines_ch_d , num_col_classifier , erosion_hurts , self . tables , self . right2left )
#print(boxes_d,'boxes_d')
#img_once = np.zeros((textline_mask_tot_d.shape[0],textline_mask_tot_d.shape[1]))
#for box_i in boxes_d:
#img_once[int(box_i[2]):int(box_i[3]),int(box_i[0]):int(box_i[1]) ] =1
#plt.imshow(img_once)
#plt.show()
#print(np.unique(img_once),'img_once')
if self . plotter :
self . plotter . write_images_into_directory ( polygons_of_images , image_page )
t_order = time . time ( )
if self . full_layout :
if np . abs ( slope_deskew ) < SLOPE_THRESHOLD :
order_text_new , id_of_texts_tot = self . do_order_of_regions ( contours_only_text_parent , contours_only_text_parent_h , boxes , textline_mask_tot )
if not self . dir_in :
return pcgts
else :
order_text_new , id_of_texts_tot = self . do_order_of_regions ( contours_only_text_parent_d_ordered , contours_only_text_parent_h_d_ordered , boxes_d , textline_mask_tot_d )
contours_only_text_parent_h = None
if np . abs ( slope_deskew ) < SLOPE_THRESHOLD :
order_text_new , id_of_texts_tot = self . do_order_of_regions ( contours_only_text_parent , contours_only_text_parent_h , boxes , textline_mask_tot )
else :
contours_only_text_parent_d_ordered = list ( np . array ( contours_only_text_parent_d_ordered , dtype = object ) [ index_by_text_par_con ] )
order_text_new , id_of_texts_tot = self . do_order_of_regions ( contours_only_text_parent_d_ordered , contours_only_text_parent_h , boxes_d , textline_mask_tot_d )
pcgts = self . writer . build_pagexml_no_full_layout ( txt_con_org , page_coord , order_text_new , id_of_texts_tot , all_found_textline_polygons , all_box_coord , polygons_of_images , polygons_of_marginals , all_found_textline_polygons_marginals , all_box_coord_marginals , slopes , slopes_marginals , cont_page , polygons_lines_xml , contours_tables )
self . logger . info ( " Job done in %.1f s " , time . time ( ) - t0 )
if not self . dir_in :
return pcgts
pcgts = self . writer . build_pagexml_full_layout ( contours_only_text_parent , contours_only_text_parent_h , page_coord , order_text_new , id_of_texts_tot , all_found_textline_polygons , all_found_textline_polygons_h , all_box_coord , all_box_coord_h , polygons_of_images , contours_tables , polygons_of_drop_capitals , polygons_of_marginals , all_found_textline_polygons_marginals , all_box_coord_marginals , slopes , slopes_h , slopes_marginals , cont_page , polygons_lines_xml )
self . logger . info ( " Job done in %.1f s " , time . time ( ) - t0 )
if not self . dir_in :
return pcgts
else :
contours_only_text_parent_h = None
if np . abs ( slope_deskew ) < SLOPE_THRESHOLD :
order_text_new , id_of_texts_tot = self . do_order_of_regions ( contours_only_text_parent , contours_only_text_parent_h , boxes , textline_mask_tot )
else :
contours_only_text_parent_d_ordered = list ( np . array ( contours_only_text_parent_d_ordered , dtype = object ) [ index_by_text_par_con ] )
order_text_new , id_of_texts_tot = self . do_order_of_regions ( contours_only_text_parent_d_ordered , contours_only_text_parent_h , boxes_d , textline_mask_tot_d )
pcgts = self . writer . build_pagexml_no_full_layout ( txt_con_org , page_coord , order_text_new , id_of_texts_tot , all_found_textline_polygons , all_box_coord , polygons_of_images , polygons_of_marginals , all_found_textline_polygons_marginals , all_box_coord_marginals , slopes , slopes_marginals , cont_page , polygons_lines_xml , contours_tables )
self . logger . info ( " Job done in %.1f s " , time . time ( ) - t0 )
if not self . dir_in :
return pcgts
if self . dir_in :
self . writer . write_pagexml ( pcgts )
#self.logger.info("Job done in %.1fs", time.time() - t0)
if self . dir_in :
self . writer . write_pagexml ( pcgts )
#self.logger.info("Job done in %.1fs", time.time() - t0)
if self . dir_in :
self . logger . info ( " All jobs done in %.1f s " , time . time ( ) - t0_tot )
self . logger . info ( " All jobs done in %.1f s " , time . time ( ) - t0_tot )