Merge pull request #132 from qurator-spk/extracting_images_only

Extracting images only
Clemens Neudecker committed via GitHub
commit 4af0bc079c

@@ -22,17 +22,14 @@ help:
 models: models_eynollah

 models_eynollah: models_eynollah.tar.gz
-	# tar xf models_eynollah_renamed.tar.gz --transform 's/models_eynollah_renamed/models_eynollah/'
-	# tar xf models_eynollah_renamed.tar.gz
-	# tar xf models_eynollah_renamed_savedmodel.tar.gz --transform 's/models_eynollah_renamed_savedmodel/models_eynollah/'
 	tar xf models_eynollah.tar.gz

 models_eynollah.tar.gz:
 	# wget 'https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz'
 	# wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz'
-	# wget 'https://ocr-d.kba.cloud/2022-04-05.SavedModel.tar.gz'
 	# wget 'https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed_savedmodel.tar.gz'
-	wget https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz
+	# wget 'https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz'
+	wget 'https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz'

 # Install with pip
 install:

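With this change, fetching and unpacking the v0.3.1 models reduces to the `models` target shown above (assuming GNU make and network access; the target simply runs the `wget` and `tar xf` steps from the Makefile):

```sh
# Download and unpack the v0.3.1 models via the Makefile targets above
make models
```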
@@ -71,6 +71,7 @@ The following options can be used to further configure the processing:
 | `-cl` | apply contour detection for curved text lines instead of bounding boxes |
 | `-ib` | apply binarization (the resulting image is saved to the output directory) |
 | `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) |
+| `-eoi` | extract only images to output directory (other processing will not be done) |
 | `-ho` | ignore headers for reading order detection |
 | `-si <directory>` | save image regions detected to this directory |
 | `-sd <directory>` | save deskewed image to this directory |

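For example, a hypothetical image-only invocation might look as follows (`page.tif`, `out/` and `models_eynollah/` are placeholder paths, and `-i`, `-o`, `-m` follow the repository's usual CLI flags). Note that, per the check added in cli.py below, `-eoi` cannot be combined with `-ae`, `-as`, `-light`, `-cl`, `-tll`, `-fl`, `-tab`, `-r2l` or `-ho`:

```sh
# Crop and save only the detected image regions; skip the remaining layout analysis
eynollah -i page.tif -o out/ -m models_eynollah/ -eoi
```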
@@ -67,6 +67,12 @@ from eynollah.eynollah import Eynollah
     is_flag=True,
     help="If set, will plot intermediary files and images",
 )
+@click.option(
+    "--extract_only_images/--disable-extracting_only_images",
+    "-eoi/-noeoi",
+    is_flag=True,
+    help="If a directory is given, only images in documents will be cropped and saved there and the other processing will not be done",
+)
 @click.option(
     "--allow-enhancement/--no-allow-enhancement",
     "-ae/-noae",
@@ -148,6 +154,7 @@ def main(
     save_layout,
     save_deskewed,
     save_all,
+    extract_only_images,
     save_page,
     enable_plotting,
     allow_enhancement,
@@ -175,12 +182,16 @@ def main(
     if textline_light and not light_version:
         print('Error: You used -tll to enable light textline detection but -light is not enabled')
         sys.exit(1)
+    if extract_only_images and (allow_enhancement or allow_scaling or light_version or curved_line or textline_light or full_layout or tables or right2left or headers_off):
+        print('Error: You used -eoi which can not be enabled alongside light_version -light or allow_scaling -as or allow_enhancement -ae or curved_line -cl or textline_light -tll or full_layout -fl or tables -tab or right2left -r2l or headers_off -ho')
+        sys.exit(1)
     eynollah = Eynollah(
         image_filename=image,
         dir_out=out,
         dir_in=dir_in,
         dir_models=model,
         dir_of_cropped_images=save_images,
+        extract_only_images=extract_only_images,
         dir_of_layout=save_layout,
         dir_of_deskewed=save_deskewed,
         dir_of_all=save_all,

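Programmatically, the new keyword threads straight through to the `Eynollah` constructor. A minimal sketch of image-only extraction from Python (paths are placeholders, and the constructor accepts more options than shown here):

```python
from eynollah.eynollah import Eynollah

# Image-only extraction run; all other layout-analysis steps are skipped.
eynollah = Eynollah(
    image_filename="page.tif",      # placeholder input image
    dir_out="out",                  # output directory for PAGE-XML
    dir_models="models_eynollah",   # unpacked v0.3.1 models
    extract_only_images=True,
)
pcgts = eynollah.run()              # returns the PAGE-XML document object
```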
@@ -149,6 +149,7 @@ class Eynollah:
         dir_out=None,
         dir_in=None,
         dir_of_cropped_images=None,
+        extract_only_images=False,
         dir_of_layout=None,
         dir_of_deskewed=None,
         dir_of_all=None,
@@ -196,6 +197,7 @@ class Eynollah:
         self.allow_scaling = allow_scaling
         self.headers_off = headers_off
         self.light_version = light_version
+        self.extract_only_images = extract_only_images
         self.ignore_page_extraction = ignore_page_extraction
         self.pcgts = pcgts
         if not dir_in:
@@ -226,6 +228,7 @@ class Eynollah:
         self.model_page_dir = dir_models + "/eynollah-page-extraction_20210425"
         self.model_region_dir_p_ens = dir_models + "/eynollah-main-regions-ensembled_20210425"
         self.model_region_dir_p_ens_light = dir_models + "/eynollah-main-regions_20220314"
+        self.model_region_dir_p_ens_light_only_images_extraction = dir_models + "/eynollah-main-regions_20231127_672_org_ens_11_13_16_17_18"
         if self.textline_light:
             self.model_textline_dir = dir_models + "/eynollah-textline_light_20210425"
         else:
@@ -250,7 +253,23 @@ class Eynollah:
             self.ls_imgs = os.listdir(self.dir_in)
-        if dir_in and not light_version:
+        if dir_in and self.extract_only_images:
+            config = tf.compat.v1.ConfigProto()
+            config.gpu_options.allow_growth = True
+            session = tf.compat.v1.Session(config=config)
+            set_session(session)
+
+            self.model_page = self.our_load_model(self.model_page_dir)
+            self.model_classifier = self.our_load_model(self.model_dir_of_col_classifier)
+            self.model_bin = self.our_load_model(self.model_dir_of_binarization)
+            #self.model_textline = self.our_load_model(self.model_textline_dir)
+            self.model_region = self.our_load_model(self.model_region_dir_p_ens_light_only_images_extraction)
+            #self.model_region_fl_np = self.our_load_model(self.model_region_dir_fully_np)
+            #self.model_region_fl = self.our_load_model(self.model_region_dir_fully)
+
+            self.ls_imgs = os.listdir(self.dir_in)
+
+        if dir_in and not (light_version or self.extract_only_images):
             config = tf.compat.v1.ConfigProto()
             config.gpu_options.allow_growth = True
             session = tf.compat.v1.Session(config=config)
@@ -464,6 +483,27 @@ class Eynollah:
         return img_new, num_column_is_classified

+    def calculate_width_height_by_columns_extract_only_images(self, img, num_col, width_early, label_p_pred):
+        self.logger.debug("enter calculate_width_height_by_columns")
+        if num_col == 1:
+            img_w_new = 700
+        elif num_col == 2:
+            img_w_new = 900
+        elif num_col == 3:
+            img_w_new = 1500
+        elif num_col == 4:
+            img_w_new = 1800
+        elif num_col == 5:
+            img_w_new = 2200
+        elif num_col == 6:
+            img_w_new = 2500
+        img_h_new = int(img.shape[0] / float(img.shape[1]) * img_w_new)
+        img_new = resize_image(img, img_h_new, img_w_new)
+        num_column_is_classified = True
+
+        return img_new, num_column_is_classified
+
     def resize_image_with_column_classifier(self, is_image_enhanced, img_bin):
         self.logger.debug("enter resize_image_with_column_classifier")
         if self.input_binary:
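
The new helper pins the resized width to a fixed value per detected column count and scales the height to preserve the aspect ratio. The same mapping could be written as a lookup table; a sketch (not the committed code, and note the original leaves `img_w_new` undefined for more than six columns):

```python
# Equivalent width lookup for 1..6 columns (values taken from the method above)
COLUMN_WIDTHS = {1: 700, 2: 900, 3: 1500, 4: 1800, 5: 2200, 6: 2500}

def target_size(img_h, img_w, num_col):
    """Return (new_h, new_w), keeping the aspect ratio img_h / img_w."""
    img_w_new = COLUMN_WIDTHS[num_col]  # raises KeyError for num_col > 6
    img_h_new = int(img_h / float(img_w) * img_w_new)
    return img_h_new, img_w_new
```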
@@ -571,6 +611,7 @@ class Eynollah:
         self.logger.info("Found %d columns (%s)", num_col, np.around(label_p_pred, decimals=5))

+        if not self.extract_only_images:
             if dpi < DPI_THRESHOLD:
                 img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred)
                 if light_version:
@@ -582,6 +623,10 @@ class Eynollah:
                 num_column_is_classified = True
                 image_res = np.copy(img)
                 is_image_enhanced = False
+        else:
+            num_column_is_classified = True
+            image_res = np.copy(img)
+            is_image_enhanced = False

         self.logger.debug("exit resize_and_enhance_image_with_column_classifier")
         return is_image_enhanced, img, image_res, num_col, num_column_is_classified, img_bin
@@ -868,7 +913,11 @@ class Eynollah:
             seg_not_base = label_p_pred[0,:,:,4]
             ##seg2 = -label_p_pred[0,:,:,2]
-            seg_not_base[seg_not_base>0.03] =1
-            seg_not_base[seg_not_base<1] =0
+            if self.extract_only_images:
+                #seg_not_base[seg_not_base>0.3] =1
+                seg_not_base[seg_not_base>0.5] =1
+                seg_not_base[seg_not_base<1] =0
+            else:
+                seg_not_base[seg_not_base>0.03] =1
+                seg_not_base[seg_not_base<1] =0
@@ -889,11 +938,8 @@ class Eynollah:
             seg_line[seg_line>0.1] =1
             seg_line[seg_line<1] =0

-            seg_background = label_p_pred[0,:,:,0]
-            ##seg2 = -label_p_pred[0,:,:,2]
-            seg_background[seg_background>0.25] =1
-            seg_background[seg_background<1] =0
+            if not self.extract_only_images:
+                seg_background = label_p_pred[0,:,:,0]
+                seg_background[seg_background>0.25] =1
+                seg_background[seg_background<1] =0
             ##seg = seg+seg2
@@ -908,7 +954,8 @@ class Eynollah:
             ##plt.show()
             #seg[seg==1]=0
             #seg[seg_test==1]=1
-            seg[seg_not_base==1]=4
+            ###seg[seg_not_base==1]=4
+            if not self.extract_only_images:
                 seg[seg_background==1]=0
             seg[(seg_line==1) & (seg==0)]=3
             seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2)
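
In image-only mode the "not base" channel is binarized with a much stricter cut-off (0.5 instead of 0.03), so far fewer pixels are flagged. A toy numpy illustration of this thresholding idiom (values are made up):

```python
import numpy as np

probs = np.array([0.01, 0.04, 0.45, 0.60])  # per-pixel class probabilities
seg = probs.copy()
seg[seg > 0.5] = 1   # strict cut-off used with extract_only_images
seg[seg < 1] = 0
print(seg)           # [0. 0. 0. 1.] -- only the 0.60 pixel survives;
                     # with the 0.03 cut-off, three of the four would
```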
@@ -1574,6 +1621,124 @@ class Eynollah:
             q.put(slopes_sub)
             poly.put(poly_sub)
             box_sub.put(boxes_sub_new)

+    def get_regions_light_v_extract_only_images(self, img, is_image_enhanced, num_col_classifier):
+        self.logger.debug("enter get_regions_extract_images_only")
+        erosion_hurts = False
+        img_org = np.copy(img)
+        img_height_h = img_org.shape[0]
+        img_width_h = img_org.shape[1]
+
+        if num_col_classifier == 1:
+            img_w_new = 700
+        elif num_col_classifier == 2:
+            img_w_new = 900
+        elif num_col_classifier == 3:
+            img_w_new = 1500
+        elif num_col_classifier == 4:
+            img_w_new = 1800
+        elif num_col_classifier == 5:
+            img_w_new = 2200
+        elif num_col_classifier == 6:
+            img_w_new = 2500
+        img_h_new = int(img.shape[0] / float(img.shape[1]) * img_w_new)
+
+        img_resized = resize_image(img, img_h_new, img_w_new)
+
+        if not self.dir_in:
+            model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_ens_light_only_images_extraction)
+            prediction_regions_org = self.do_prediction_new_concept(True, img_resized, model_region)
+        else:
+            prediction_regions_org = self.do_prediction_new_concept(True, img_resized, self.model_region)
+
+        #plt.imshow(prediction_regions_org[:,:,0])
+        #plt.show()
+
+        prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h)
+
+        image_page, page_coord, cont_page = self.extract_page()
+
+        prediction_regions_org = prediction_regions_org[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]]
+
+        prediction_regions_org = prediction_regions_org[:,:,0]
+
+        mask_lines_only = (prediction_regions_org[:,:] == 3) * 1
+        mask_texts_only = (prediction_regions_org[:,:] == 1) * 1
+        mask_images_only = (prediction_regions_org[:,:] == 2) * 1
+
+        polygons_lines_xml, hir_lines_xml = return_contours_of_image(mask_lines_only)
+        polygons_lines_xml = textline_con_fil = filter_contours_area_of_image(mask_lines_only, polygons_lines_xml, hir_lines_xml, max_area=1, min_area=0.00001)
+
+        polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only, 1, 0.00001)
+        polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only, 1, 0.00001)
+
+        text_regions_p_true = np.zeros(prediction_regions_org.shape)
+        text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_lines, color=(3, 3, 3))
+        text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2
+        text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_texts, color=(1, 1, 1))
+
+        text_regions_p_true[text_regions_p_true.shape[0]-15:text_regions_p_true.shape[0], :] = 0
+        text_regions_p_true[:, text_regions_p_true.shape[1]-15:text_regions_p_true.shape[1]] = 0
+
+        ##polygons_of_images = return_contours_of_interested_region(text_regions_p_true, 2, 0.0001)
+        polygons_of_images = return_contours_of_interested_region(text_regions_p_true, 2, 0.001)
+
+        image_boundary_of_doc = np.zeros((text_regions_p_true.shape[0], text_regions_p_true.shape[1]))
+        ###image_boundary_of_doc[:6, :] = 1
+        ###image_boundary_of_doc[text_regions_p_true.shape[0]-6:text_regions_p_true.shape[0], :] = 1
+        ###image_boundary_of_doc[:, :6] = 1
+        ###image_boundary_of_doc[:, text_regions_p_true.shape[1]-6:text_regions_p_true.shape[1]] = 1
+        #plt.imshow(image_boundary_of_doc)
+        #plt.show()
+
+        polygons_of_images_fin = []
+        for ploy_img_ind in polygons_of_images:
+            """
+            test_poly_image = np.zeros((text_regions_p_true.shape[0], text_regions_p_true.shape[1]))
+            test_poly_image = cv2.fillPoly(test_poly_image, pts=[ploy_img_ind], color=(1, 1, 1))
+
+            test_poly_image = test_poly_image[:,:] + image_boundary_of_doc[:,:]
+            test_poly_image_intersected_area = (test_poly_image[:,:] == 2) * 1
+
+            test_poly_image_intersected_area = test_poly_image_intersected_area.sum()
+
+            if test_poly_image_intersected_area == 0:
+                ##polygons_of_images_fin.append(ploy_img_ind)
+                x, y, w, h = cv2.boundingRect(ploy_img_ind)
+                box = [x, y, w, h]
+                _, page_coord_img = crop_image_inside_box(box, text_regions_p_true)
+                #cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
+                polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]))
+            """
+            x, y, w, h = cv2.boundingRect(ploy_img_ind)
+            if h < 150 or w < 150:
+                pass
+            else:
+                box = [x, y, w, h]
+                _, page_coord_img = crop_image_inside_box(box, text_regions_p_true)
+                #cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
+                polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]))
+
+        return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin, image_page, page_coord, cont_page
+
     def get_regions_light_v(self, img, is_image_enhanced, num_col_classifier):
         self.logger.debug("enter get_regions_light_v")
         erosion_hurts = False
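
The final loop of the new method keeps only contours whose bounding box is at least 150 px in both dimensions and stores them as axis-aligned corner quadrilaterals. A self-contained sketch of that filter using plain OpenCV (independent of the Eynollah helpers, so the coordinate clamping done by `crop_image_inside_box` is omitted):

```python
import cv2
import numpy as np

def filter_image_boxes(contours, min_side=150):
    """Keep contours whose bounding box is >= min_side in both dimensions,
    returned as 4-corner rectangles in the same corner order as above."""
    boxes = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if h < min_side or w < min_side:
            continue  # too small to be a plausible document image
        boxes.append(np.array([[x, y], [x + w, y], [x + w, y + h], [x, y + h]]))
    return boxes
```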
@@ -2425,6 +2590,7 @@ class Eynollah:
         prediction_table_erode = cv2.erode(prediction_table[:,:,0], KERNEL, iterations=20)
         prediction_table_erode = cv2.dilate(prediction_table_erode, KERNEL, iterations=20)
         return prediction_table_erode.astype(np.int16)
+
     def run_graphics_and_columns_light(self, text_regions_p_1, textline_mask_tot_ea, num_col_classifier, num_column_is_classified, erosion_hurts):
         img_g = self.imread(grayscale=True, uint8=True)
@ -2826,7 +2992,6 @@ class Eynollah:
""" """
self.logger.debug("enter run") self.logger.debug("enter run")
t0_tot = time.time() t0_tot = time.time()
if not self.dir_in: if not self.dir_in:
@@ -2837,6 +3002,24 @@ class Eynollah:
             if self.dir_in:
                 self.reset_file_name_dir(os.path.join(self.dir_in, img_name))

+            if self.extract_only_images:
+                img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version)
+                self.logger.info("Enhancing took %.1fs ", time.time() - t0)
+
+                text_regions_p_1, erosion_hurts, polygons_lines_xml, polygons_of_images, image_page, page_coord, cont_page = self.get_regions_light_v_extract_only_images(img_res, is_image_enhanced, num_col_classifier)
+
+                pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], polygons_of_images, [], [], [], [], [], cont_page, [], [])
+
+                if self.plotter:
+                    self.plotter.write_images_into_directory(polygons_of_images, image_page)
+
+                if self.dir_in:
+                    self.writer.write_pagexml(pcgts)
+                else:
+                    return pcgts
+            else:
                 img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version)
                 self.logger.info("Enhancing took %.1fs ", time.time() - t0)
@@ -3091,6 +3274,7 @@ class Eynollah:
                 pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml)
             self.logger.info("Job done in %.1fs", time.time() - t0)
+
             if not self.dir_in:
                 return pcgts
             else:

@@ -52,10 +52,10 @@
     },
     "resources": [
       {
-        "description": "models for eynollah (TensorFlow format)",
-        "url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz",
+        "description": "models for eynollah (TensorFlow SavedModel format)",
+        "url": "https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz",
         "name": "default",
-        "size": 1761991295,
+        "size": 1894627041,
         "type": "archive",
         "path_in_archive": "models_eynollah"
       }

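For OCR-D setups, the updated resource can presumably be fetched through the resource manager instead of wget (shown for illustration; `ocrd-eynollah-segment` is the repository's OCR-D executable name, and `default` is the resource name declared in ocrd-tool.json above):

```sh
# Resolve the "default" model resource declared in ocrd-tool.json
ocrd resmgr download ocrd-eynollah-segment default
```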
@@ -172,10 +172,18 @@ class EynollahXmlWriter():
             page.add_ImageRegion(img_region)
             points_co = ''
             for lmm in range(len(found_polygons_text_region_img[mm])):
-                points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x))
-                points_co += ','
-                points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y))
-                points_co += ' '
+                try:
+                    points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x))
+                    points_co += ','
+                    points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y))
+                    points_co += ' '
+                except:
+                    points_co += str(int((found_polygons_text_region_img[mm][lmm][0] + page_coord[2]) / self.scale_x))
+                    points_co += ','
+                    points_co += str(int((found_polygons_text_region_img[mm][lmm][1] + page_coord[0]) / self.scale_y))
+                    points_co += ' '

             img_region.get_Coords().set_points(points_co[:-1])

         for mm in range(len(polygons_lines_to_be_written_in_xml)):

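The try/except in the writer accommodates two polygon layouts: OpenCV contours of shape (N, 1, 2), indexed as `[lmm, 0, 0]`, and the plain (N, 2) corner arrays produced by `get_regions_light_v_extract_only_images`, indexed as `[lmm][0]`. A small illustration of the difference (toy values):

```python
import numpy as np

cv_contour = np.array([[[10, 20]], [[30, 20]]])  # OpenCV contour, shape (N, 1, 2)
plain_poly = np.array([[10, 20], [30, 20]])      # plain corner array, shape (N, 2)

print(cv_contour[0, 0, 0], cv_contour[0, 0, 1])  # 10 20
print(plain_poly[0][0], plain_poly[0][1])        # 10 20
# Indexing the (N, 2) array as [0, 0, 0] raises IndexError (too many indices),
# which is what the except branch in the writer catches.
```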