From 8e2cdad1be6c7ad6577f495eab22495671f4428c Mon Sep 17 00:00:00 2001
From: vahidrezanezhad
Date: Wed, 7 Aug 2024 23:22:27 +0200
Subject: [PATCH] extracting images only - avoid artifacts with heuristics

---
Review notes (this section is ignored by git am):
- This patch was recovered from a copy whose line breaks and leading
  whitespace had been stripped. Indentation and the position of blank
  context lines in the eynollah.py hunks are reconstructed; apply with
  `git apply --ignore-whitespace` and check the resulting indentation, or
  regenerate the patch from the repository.
- The size filter is written as a positive condition instead of
  `if ...: pass / else:`; behaviour is unchanged.
- The helper script runs its external commands through subprocess argument
  lists instead of shell strings, creates the output directories
  recursively, skips blank PPN lines and always cleans up temporary data.
- The index hashes below are those of the original commit.

 qurator/eynollah/eynollah.py           | 14 +++--
 run_image_extraction_over_ppn_lists.py | 72 ++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 6 deletions(-)
 create mode 100644 run_image_extraction_over_ppn_lists.py

diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py
index a5d7b38..6c3fa3e 100644
--- a/qurator/eynollah/eynollah.py
+++ b/qurator/eynollah/eynollah.py
@@ -1731,11 +1731,13 @@ class Eynollah:
             polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) )
             """
             x, y, w, h = cv2.boundingRect(ploy_img_ind)
-            box = [x, y, w, h]
-            _, page_coord_img = crop_image_inside_box(box, text_regions_p_true)
-            #cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
-
-            polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) )
+            # Heuristic: keep only regions at least 150 px wide and high; smaller ones are treated as artifacts.
+            if h >= 150 and w >= 150:
+                box = [x, y, w, h]
+                _, page_coord_img = crop_image_inside_box(box, text_regions_p_true)
+                #cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
+
+                polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) )
 
         return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin, image_page, page_coord, cont_page
     def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier):
@@ -3011,7 +3013,7 @@ class Eynollah:
                 pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], polygons_of_images, [], [], [], [], [], cont_page, [], [])
 
                 if self.plotter:
-                    self.plotter.write_images_into_directory(polygons_of_images, img_res)
+                    self.plotter.write_images_into_directory(polygons_of_images, image_page)
                     #plt.imshow(text_regions_p_1)
                     #plt.show()
 
diff --git a/run_image_extraction_over_ppn_lists.py b/run_image_extraction_over_ppn_lists.py
new file mode 100644
index 0000000..a890022
--- /dev/null
+++ b/run_image_extraction_over_ppn_lists.py
@@ -0,0 +1,72 @@
+"""Run eynollah image extraction over a list of PPNs.
+
+For every PPN listed (one per line) in the PPN list file, the script
+downloads the corresponding zip archive, unpacks it, removes the ``*.txt``
+files from the unpacked directory, runs ``eynollah`` on that directory and
+finally deletes the downloaded archive and the unpacked directory again.
+
+Usage: python run_image_extraction_over_ppn_lists.py [PPN_LIST_FILE]
+"""
+import glob
+import os
+import shutil
+import subprocess
+import sys
+
+# Site-specific defaults; adjust them to the local setup.
+PPN_LIST_FILE = '/home/vahid/Documents/eynollah/ppn_list.txt'
+MODEL_DIR = '/home/vahid/Downloads/models_eynollah_renamed_savedmodel'
+URL_MAIN = 'https://content.staatsbibliothek-berlin.de/dc/download/zip?ppn=PPN'
+OUT_RESULT = './new_results_ppns2'
+
+
+def read_ppn_list(path):
+    """Return the non-empty, whitespace-stripped lines of *path*."""
+    with open(path) as f:
+        return [line.strip() for line in f if line.strip()]
+
+
+def process_ppn(ppn):
+    """Download, unpack and process one PPN, then remove the temporary data.
+
+    Failures of a single PPN are reported on stderr and do not abort the run.
+    """
+    work_dir = 'PPN_' + ppn
+    zip_path = work_dir + '.zip'
+    out_dir = os.path.join(OUT_RESULT, work_dir + '_out')
+    out_images_dir = os.path.join(OUT_RESULT, work_dir + '_out_images')
+    try:
+        # Argument lists (no shell) keep '?' in the URL and any odd
+        # characters in a PPN from being interpreted by the shell.
+        subprocess.run(['curl', '--fail', '-o', zip_path, URL_MAIN + ppn],
+                       check=True)
+        subprocess.run(['unzip', zip_path, '-d', work_dir], check=True)
+        for txt_path in glob.glob(os.path.join(work_dir, '*.txt')):
+            if os.path.isdir(txt_path):
+                shutil.rmtree(txt_path)
+            else:
+                os.remove(txt_path)
+
+        # makedirs also creates OUT_RESULT itself when it is missing.
+        os.makedirs(out_dir, exist_ok=True)
+        os.makedirs(out_images_dir, exist_ok=True)
+        subprocess.run(['eynollah', '-m', MODEL_DIR, '-di', work_dir,
+                        '-o', out_dir, '-eoi', '-ep', '-si', out_images_dir],
+                       check=True)
+    except (subprocess.CalledProcessError, OSError) as err:
+        print('PPN ' + ppn + ' failed: ' + str(err), file=sys.stderr)
+    finally:
+        if os.path.exists(zip_path):
+            os.remove(zip_path)
+        shutil.rmtree(work_dir, ignore_errors=True)
+
+
+def main():
+    """Process every PPN from the list file given on the command line."""
+    list_file = sys.argv[1] if len(sys.argv) > 1 else PPN_LIST_FILE
+    for ppn in read_ppn_list(list_file):
+        process_ppn(ppn)
+
+
+if __name__ == '__main__':
+    main()