|
|
@ -2066,89 +2066,96 @@ class textline_detector:
|
|
|
|
gc.collect()
|
|
|
|
gc.collect()
|
|
|
|
t2=time.time()
|
|
|
|
t2=time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
# extract text regions and corresponding contours and surrounding box
|
|
|
|
try:
|
|
|
|
text_regions=self.extract_text_regions(image_page)
|
|
|
|
# extract text regions and corresponding contours and surrounding box
|
|
|
|
|
|
|
|
text_regions=self.extract_text_regions(image_page)
|
|
|
|
|
|
|
|
|
|
|
|
text_regions = cv2.erode(text_regions, self.kernel, iterations=3)
|
|
|
|
text_regions = cv2.erode(text_regions, self.kernel, iterations=3)
|
|
|
|
text_regions = cv2.dilate(text_regions, self.kernel, iterations=4)
|
|
|
|
text_regions = cv2.dilate(text_regions, self.kernel, iterations=4)
|
|
|
|
|
|
|
|
|
|
|
|
#plt.imshow(text_regions[:,:,0])
|
|
|
|
#plt.imshow(text_regions[:,:,0])
|
|
|
|
#plt.show()
|
|
|
|
#plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
contours=self.get_text_region_contours_and_boxes(text_regions)
|
|
|
|
contours=self.get_text_region_contours_and_boxes(text_regions)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
##########
|
|
|
|
##########
|
|
|
|
K.clear_session()
|
|
|
|
K.clear_session()
|
|
|
|
gc.collect()
|
|
|
|
gc.collect()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
except:
|
|
|
|
except:
|
|
|
|
text_regions=None
|
|
|
|
text_regions=None
|
|
|
|
contours=[]
|
|
|
|
contours=[]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
t3=time.time()
|
|
|
|
t3=time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if len(contours)>0:
|
|
|
|
if len(contours)>0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# extracting textlines using segmentation
|
|
|
|
# extracting textlines using segmentation
|
|
|
|
textline_mask_tot=self.textline_contours(image_page)
|
|
|
|
textline_mask_tot=self.textline_contours(image_page)
|
|
|
|
##########
|
|
|
|
##########
|
|
|
|
K.clear_session()
|
|
|
|
K.clear_session()
|
|
|
|
gc.collect()
|
|
|
|
gc.collect()
|
|
|
|
|
|
|
|
|
|
|
|
t4=time.time()
|
|
|
|
t4=time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# calculate the slope for deskewing for each box of text region.
|
|
|
|
# calculate the slope for deskewing for each box of text region.
|
|
|
|
contours=self.get_slopes_and_deskew(contours,textline_mask_tot)
|
|
|
|
contours=self.get_slopes_and_deskew(contours,textline_mask_tot)
|
|
|
|
|
|
|
|
|
|
|
|
gc.collect()
|
|
|
|
gc.collect()
|
|
|
|
t5=time.time()
|
|
|
|
t5=time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# get orders of each textregion. This method by now only works for one column documents.
|
|
|
|
# get orders of each textregion. This method by now only works for one column documents.
|
|
|
|
indexes_sorted, matrix_of_orders=self.order_of_regions(textline_mask_tot,contours)
|
|
|
|
indexes_sorted, matrix_of_orders=self.order_of_regions(textline_mask_tot,contours)
|
|
|
|
order_of_texts, id_of_texts=self.order_and_id_of_texts(contours ,matrix_of_orders ,indexes_sorted )
|
|
|
|
order_of_texts, id_of_texts=self.order_and_id_of_texts(contours ,matrix_of_orders ,indexes_sorted )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
##########
|
|
|
|
##########
|
|
|
|
gc.collect()
|
|
|
|
gc.collect()
|
|
|
|
t6=time.time()
|
|
|
|
t6=time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.get_all_image_patches_coordination(image_page)
|
|
|
|
self.get_all_image_patches_coordination(image_page)
|
|
|
|
|
|
|
|
|
|
|
|
##########
|
|
|
|
##########
|
|
|
|
##########
|
|
|
|
##########
|
|
|
|
gc.collect()
|
|
|
|
gc.collect()
|
|
|
|
|
|
|
|
|
|
|
|
t7=time.time()
|
|
|
|
t7=time.time()
|
|
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
|
|
|
|
contours=[]
|
|
|
|
|
|
|
|
order_of_texts=None
|
|
|
|
|
|
|
|
id_of_texts=None
|
|
|
|
|
|
|
|
self.write_into_page_xml(contours,page_coord,self.dir_out , order_of_texts , id_of_texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Destroy the current Keras session/graph to free memory
|
|
|
|
|
|
|
|
K.clear_session()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print( "time total = "+"{0:.2f}".format(time.time()-t1) )
|
|
|
|
|
|
|
|
print( "time needed for page extraction = "+"{0:.2f}".format(t2-t1) )
|
|
|
|
|
|
|
|
print( "time needed for text region extraction and get contours = "+"{0:.2f}".format(t3-t2) )
|
|
|
|
|
|
|
|
if len(contours)>0:
|
|
|
|
|
|
|
|
print( "time needed for textlines = "+"{0:.2f}".format(t4-t3) )
|
|
|
|
|
|
|
|
print( "time needed to get slopes of regions (deskewing) = "+"{0:.2f}".format(t5-t4) )
|
|
|
|
|
|
|
|
print( "time needed to get order of regions = "+"{0:.2f}".format(t6-t5) )
|
|
|
|
|
|
|
|
print( "time needed to implement deskewing = "+"{0:.2f}".format(t7-t6) )
|
|
|
|
|
|
|
|
except:
|
|
|
|
contours=[]
|
|
|
|
contours=[]
|
|
|
|
order_of_texts=None
|
|
|
|
order_of_texts=None
|
|
|
|
id_of_texts=None
|
|
|
|
id_of_texts=None
|
|
|
|
self.write_into_page_xml(contours,page_coord,self.dir_out , order_of_texts , id_of_texts)
|
|
|
|
self.write_into_page_xml(contours,page_coord,self.dir_out , order_of_texts , id_of_texts)
|
|
|
|
|
|
|
|
print( "time total = "+"{0:.2f}".format(time.time()-t1) )
|
|
|
|
# Destroy the current Keras session/graph to free memory
|
|
|
|
|
|
|
|
K.clear_session()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print( "time total = "+"{0:.2f}".format(time.time()-t1) )
|
|
|
|
|
|
|
|
print( "time needed for page extraction = "+"{0:.2f}".format(t2-t1) )
|
|
|
|
|
|
|
|
print( "time needed for text region extraction and get contours = "+"{0:.2f}".format(t3-t2) )
|
|
|
|
|
|
|
|
if len(contours)>0:
|
|
|
|
|
|
|
|
print( "time needed for textlines = "+"{0:.2f}".format(t4-t3) )
|
|
|
|
|
|
|
|
print( "time needed to get slopes of regions (deskewing) = "+"{0:.2f}".format(t5-t4) )
|
|
|
|
|
|
|
|
print( "time needed to get order of regions = "+"{0:.2f}".format(t6-t5) )
|
|
|
|
|
|
|
|
print( "time needed to implement deskewing = "+"{0:.2f}".format(t7-t6) )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|