extracting images only - avoid artifacts with heuristics

pull/132/head
vahidrezanezhad 3 months ago
parent 9170a9f21c
commit 8e2cdad1be

@ -1731,11 +1731,14 @@ class Eynollah:
polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) )
"""
x, y, w, h = cv2.boundingRect(ploy_img_ind)
box = [x, y, w, h]
_, page_coord_img = crop_image_inside_box(box, text_regions_p_true)
#cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) )
if h < 150 or w < 150:
pass
else:
box = [x, y, w, h]
_, page_coord_img = crop_image_inside_box(box, text_regions_p_true)
#cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) )
return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin, image_page, page_coord, cont_page
def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier):
@ -3011,7 +3014,7 @@ class Eynollah:
pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], polygons_of_images, [], [], [], [], [], cont_page, [], [])
if self.plotter:
self.plotter.write_images_into_directory(polygons_of_images, img_res)
self.plotter.write_images_into_directory(polygons_of_images, image_page)
#plt.imshow(text_regions_p_1)
#plt.show()

@ -0,0 +1,33 @@
import glob
import os
import shutil
import subprocess
import sys
# Batch driver: for every PPN listed in `dir_ppn`, download the zip archive
# from `url_main`, unpack it, run the `eynollah` command line tool on the
# unpacked directory and finally delete the downloaded/unpacked data again.
#
# Every external program is started with an argument list (no shell), so a
# malformed line in the PPN list cannot be interpreted as shell syntax.

# Text file with one PPN number per line.
dir_ppn = '/home/vahid/Documents/eynollah/ppn_list.txt'
# Download endpoint; the PPN number is appended directly to this prefix, e.g.
# https://content.staatsbibliothek-berlin.de/dc/download/zip?ppn=PPN1762638355
url_main = 'https://content.staatsbibliothek-berlin.de/dc/download/zip?ppn=PPN'
# Root directory that receives one "<PPN>_out" and one "<PPN>_out_images"
# sub-directory per processed PPN.
out_result = './new_results_ppns2'
# Model directory handed to eynollah via "-m".
dir_models = '/home/vahid/Downloads/models_eynollah_renamed_savedmodel'


def parse_ppn_lines(lines):
    """Return the PPN identifiers contained in *lines*.

    Surrounding whitespace (including "\\n" and "\\r\\n" line endings) is
    removed and blank lines are skipped, so an empty line in the list file
    no longer produces a request for an empty PPN.
    """
    stripped = (line.strip() for line in lines)
    return [ppn for ppn in stripped if ppn]


def build_eynollah_command(ppn, out_dir=out_result, model_dir=dir_models):
    """Return the eynollah invocation for *ppn* as an argument list.

    The input directory is "PPN_<ppn>" (relative to the working directory);
    results go to "<out_dir>/PPN_<ppn>_out" and "<out_dir>/PPN_<ppn>_out_images".
    NOTE(review): "-eoi"/"-ep"/"-si" are passed through exactly as in the
    original command; presumably they select image-only extraction with
    plotting and the image output directory -- confirm against eynollah's CLI.
    """
    name = "PPN_" + ppn
    return [
        "eynollah",
        "-m", model_dir,
        "-di", name,
        "-o", out_dir + '/' + name + "_out",
        "-eoi",
        "-ep",
        "-si", out_dir + '/' + name + "_out_images",
    ]


def run_step(cmd, ok_codes=(0,)):
    """Run *cmd* without a shell.

    Raises subprocess.CalledProcessError when the exit status is not listed
    in *ok_codes*, and OSError when the program cannot be started at all.
    """
    result = subprocess.run(cmd)
    if result.returncode not in ok_codes:
        raise subprocess.CalledProcessError(result.returncode, cmd)


def remove_path(path):
    """Delete *path* whether it is a file or a directory (like `rm -rf`)."""
    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path, ignore_errors=True)
        return
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def process_ppn(ppn):
    """Download, unpack and process one PPN, then remove the temporary data.

    Raises subprocess.CalledProcessError / OSError when a step fails; the
    downloaded archive and the unpacked directory are removed in any case.
    """
    name = "PPN_" + ppn
    zip_path = "./" + name + ".zip"
    try:
        run_step(["curl", "-o", zip_path, url_main + ppn])
        # unzip reports status 1 for warnings although extraction completed.
        run_step(["unzip", name + ".zip", "-d", name], ok_codes=(0, 1))
        # Drop the *.txt entries shipped inside the archive before processing.
        for txt_path in glob.glob(os.path.join(name, "*.txt")):
            remove_path(txt_path)
        # makedirs also creates `out_result` itself on the first run and does
        # not fail when a directory is left over from an earlier run.
        os.makedirs(out_result + '/' + name + "_out", exist_ok=True)
        os.makedirs(out_result + '/' + name + "_out_images", exist_ok=True)
        run_step(build_eynollah_command(ppn))
    finally:
        remove_path(name + ".zip")
        remove_path(name)


def main():
    """Process every PPN of the list file; a failing PPN is reported and skipped."""
    with open(dir_ppn) as f:
        ppn_list = parse_ppn_lines(f)

    for ppn_ind in ppn_list:
        try:
            process_ppn(ppn_ind)
        except (subprocess.CalledProcessError, OSError) as err:
            # Keep the batch going, as the original best-effort loop did.
            print("PPN " + ppn_ind + ": skipped (" + str(err) + ")", file=sys.stderr)


if __name__ == "__main__":
    main()
Loading…
Cancel
Save