Mirror of https://github.com/qurator-spk/eynollah.git

Commit 735abc43f3 (parent 3bbbeecfec)
option to ignore page extraction

2 changed files with 97 additions and 151 deletions

The commit adds an --ignore_page_extraction switch to the CLI and a matching Eynollah constructor flag, guards extract_page and early_page_for_num_of_column_classification behind it with a full-image fallback, reorders those two methods, and makes the no-columns early exit aware of batch (dir_in) mode.
qurator/eynollah/cli.py

@@ -108,6 +108,12 @@ from qurator.eynollah.eynollah import Eynollah
     is_flag=True,
     help="if this parameter set to true, this tool would use lighter version",
 )
+@click.option(
+    "--ignore_page_extraction/--extract_page_included",
+    "-ipe/-epi",
+    is_flag=True,
+    help="if this parameter set to true, this tool would ignore page extraction",
+)
 @click.option(
     "--log-level",
     "-l",
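The new paired switch follows the pattern of the existing light_version flag. As a minimal standalone sketch of how such a click flag behaves (option names copied from the diff; the command body is hypothetical):

import click

@click.command()
@click.option(
    "--ignore_page_extraction/--extract_page_included",
    "-ipe/-epi",
    is_flag=True,
    help="if set, skip page-border extraction and use the full image",
)
def main(ignore_page_extraction):
    # click maps the paired switch to a single boolean parameter:
    # --extract_page_included (the default) yields False, -ipe yields True
    click.echo("ignore_page_extraction=%s" % ignore_page_extraction)

if __name__ == "__main__":
    main()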
@@ -132,6 +138,7 @@ def main(
     allow_scaling,
     headers_off,
     light_version,
+    ignore_page_extraction,
     log_level
 ):
     if log_level:

@@ -161,6 +168,7 @@ def main(
         allow_scaling=allow_scaling,
         headers_off=headers_off,
         light_version=light_version,
+        ignore_page_extraction=ignore_page_extraction,
     )
     eynollah.run()
     #pcgts = eynollah.run()
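Together with the option declaration above, these two hunks simply thread the new parameter from the click-decorated main() into the Eynollah constructor. From the shell this would look like, for example, eynollah -i page.tif -o out/ -m models/ -ipe (the -i/-o/-m companion flags are assumptions about the surrounding CLI, not part of this diff).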
qurator/eynollah/eynollah.py

@@ -105,6 +105,7 @@ class Eynollah:
         allow_scaling=False,
         headers_off=False,
         light_version=False,
+        ignore_page_extraction=False,
         override_dpi=None,
         logger=None,
         pcgts=None,

@@ -133,6 +134,7 @@ class Eynollah:
         self.allow_scaling = allow_scaling
         self.headers_off = headers_off
         self.light_version = light_version
+        self.ignore_page_extraction = ignore_page_extraction
         self.pcgts = pcgts
         if not dir_in:
             self.plotter = None if not enable_plotting else EynollahPlotter(
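A minimal sketch of driving the new keyword from Python rather than the CLI, assuming the constructor's leading arguments (model directory, image_filename) match the rest of the parameter list, of which this diff only shows a slice:

from qurator.eynollah.eynollah import Eynollah

# paths are placeholders; only ignore_page_extraction comes from this diff
eynollah = Eynollah(
    "models/",                    # assumed positional model directory
    image_filename="page.png",    # assumed input-image keyword
    ignore_page_extraction=True,  # the new flag: skip page-border detection
)
eynollah.run()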
@@ -886,45 +888,10 @@ class Eynollah:
         gc.collect()
         return prediction_true
 
-    def early_page_for_num_of_column_classification(self, img_bin):
-        self.logger.debug("enter early_page_for_num_of_column_classification")
-        if self.input_binary:
-            img = np.copy(img_bin)
-            img = img.astype(np.uint8)
-        else:
-            img = self.imread()
-        if not self.dir_in:
-            model_page, session_page = self.start_new_session_and_model(self.model_page_dir)
-        img = cv2.GaussianBlur(img, (5, 5), 0)
-        if self.dir_in:
-            img_page_prediction = self.do_prediction(False, img, self.model_page)
-        else:
-            img_page_prediction = self.do_prediction(False, img, model_page)
-
-        imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
-        _, thresh = cv2.threshold(imgray, 0, 255, 0)
-        thresh = cv2.dilate(thresh, KERNEL, iterations=3)
-        contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
-        if len(contours) > 0:
-            cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))])
-            cnt = contours[np.argmax(cnt_size)]
-            x, y, w, h = cv2.boundingRect(cnt)
-            box = [x, y, w, h]
-        else:
-            box = [0, 0, img.shape[1], img.shape[0]]
-        croped_page, page_coord = crop_image_inside_box(box, img)
-        if not self.dir_in:
-            session_page.close()
-            del model_page
-            del session_page
-            K.clear_session()
-            gc.collect()
-        self.logger.debug("exit early_page_for_num_of_column_classification")
-        return croped_page, page_coord
-
     def extract_page(self):
         self.logger.debug("enter extract_page")
         cont_page = []
+        if not self.ignore_page_extraction:
         if not self.dir_in:
             model_page, session_page = self.start_new_session_and_model(self.model_page_dir)
         img = cv2.GaussianBlur(self.image, (5, 5), 0)
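The removed early_page_for_num_of_column_classification is not gone; it is re-added below extract_page with the new guard (the bodies under each new if not self.ignore_page_extraction: are presumably re-indented in the commit, but the diff viewer ignores whitespace, so they appear as unchanged context here and in the following hunks). The page-detection post-processing both methods share is a standard largest-contour crop; a self-contained sketch of that step, with the kernel shape an assumption:

import cv2
import numpy as np

KERNEL = np.ones((5, 5), np.uint8)  # assumption: stand-in for eynollah's KERNEL

def page_box_from_prediction(img_page_prediction):
    """Reduce a BGR page-segmentation prediction to one bounding box,
    following the threshold -> dilate -> largest-contour steps in the diff."""
    imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(imgray, 0, 255, 0)       # any nonzero pixel -> 255
    thresh = cv2.dilate(thresh, KERNEL, iterations=3)  # close small gaps
    contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) > 0:
        cnt = max(contours, key=cv2.contourArea)       # keep the largest region
        return list(cv2.boundingRect(cnt))             # [x, y, w, h]
    return [0, 0, img_page_prediction.shape[1], img_page_prediction.shape[0]]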
@@ -964,9 +931,14 @@ class Eynollah:
             K.clear_session()
             gc.collect()
         self.logger.debug("exit extract_page")
+        else:
+            box = [0, 0, self.image.shape[1], self.image.shape[0]]
+            croped_page, page_coord = crop_image_inside_box(box, self.image)
+            cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
         return croped_page, page_coord, cont_page
 
     def early_page_for_num_of_column_classification(self, img_bin):
+        if not self.ignore_page_extraction:
         self.logger.debug("enter early_page_for_num_of_column_classification")
         if self.input_binary:
             img = np.copy(img_bin)
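This else branch is the feature itself: when page extraction is ignored, the whole image frame becomes the page. A sketch of the fallback with crop_image_inside_box inlined (its [y0, y1, x0, x1] page_coord ordering is inferred from how the corner points are assembled into cont_page):

import numpy as np

def full_page_fallback(image):
    """Skip page detection entirely: the page box is the full image."""
    box = [0, 0, image.shape[1], image.shape[0]]                     # [x, y, w, h]
    croped_page = image[box[1]:box[1] + box[3], box[0]:box[0] + box[2]]
    page_coord = [box[1], box[1] + box[3], box[0], box[0] + box[2]]  # [y0, y1, x0, x1]
    # the page contour is just the four image corners, appended in the
    # same order as the diff's cont_page.append(...) call
    cont_page = [np.array([[page_coord[2], page_coord[0]],
                           [page_coord[3], page_coord[0]],
                           [page_coord[3], page_coord[1]],
                           [page_coord[2], page_coord[1]]])]
    return croped_page, page_coord, cont_page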
@@ -1004,51 +976,12 @@ class Eynollah:
             gc.collect()
 
         self.logger.debug("exit early_page_for_num_of_column_classification")
+        else:
+            img = self.imread()
+            box = [0, 0, img.shape[1], img.shape[0]]
+            croped_page, page_coord = crop_image_inside_box(box, img)
         return croped_page, page_coord
 
-    def extract_page(self):
-        self.logger.debug("enter extract_page")
-        cont_page = []
-        if not self.dir_in:
-            model_page, session_page = self.start_new_session_and_model(self.model_page_dir)
-        img = cv2.GaussianBlur(self.image, (5, 5), 0)
-        if not self.dir_in:
-            img_page_prediction = self.do_prediction(False, img, model_page)
-        else:
-            img_page_prediction = self.do_prediction(False, img, self.model_page)
-        imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
-        _, thresh = cv2.threshold(imgray, 0, 255, 0)
-        thresh = cv2.dilate(thresh, KERNEL, iterations=3)
-        contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
-
-        if len(contours) > 0:
-            cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))])
-            cnt = contours[np.argmax(cnt_size)]
-            x, y, w, h = cv2.boundingRect(cnt)
-            if x <= 30:
-                w += x
-                x = 0
-            if (self.image.shape[1] - (x + w)) <= 30:
-                w = w + (self.image.shape[1] - (x + w))
-            if y <= 30:
-                h = h + y
-                y = 0
-            if (self.image.shape[0] - (y + h)) <= 30:
-                h = h + (self.image.shape[0] - (y + h))
-
-            box = [x, y, w, h]
-        else:
-            box = [0, 0, img.shape[1], img.shape[0]]
-        croped_page, page_coord = crop_image_inside_box(box, self.image)
-        cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
-        if not self.dir_in:
-            session_page.close()
-            del model_page
-            del session_page
-            K.clear_session()
-            gc.collect()
-        self.logger.debug("exit extract_page")
-        return croped_page, page_coord, cont_page
-
     def extract_text_regions(self, img, patches, cols):
         self.logger.debug("enter extract_text_regions")
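One detail of the (moved) extract_page body worth calling out: after the bounding box is found, it is snapped to the image border whenever it comes within 30 px of it, so near-full-page detections do not leave thin slivers at the edges. That logic, extracted into a standalone helper (the function name and the margin parameter are mine):

def snap_box_to_edges(x, y, w, h, img_w, img_h, margin=30):
    """Grow a detected page box out to the image border when it is
    already within `margin` pixels of that border."""
    if x <= margin:                  # close to the left edge: extend to x = 0
        w += x
        x = 0
    if img_w - (x + w) <= margin:    # close to the right edge
        w = img_w - x
    if y <= margin:                  # close to the top edge
        h += y
        y = 0
    if img_h - (y + h) <= margin:    # close to the bottom edge
        h = img_h - y
    return x, y, w, h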
@@ -2960,9 +2893,14 @@ class Eynollah:
         #self.logger.info('cont_page %s', cont_page)
 
         if not num_col:
+            print('buraya galir??')
             self.logger.info("No columns detected, outputting an empty PAGE-XML")
             pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], [], cont_page, [], [])
             self.logger.info("Job done in %.1fs", time.time() - t1)
+            if self.dir_in:
+                self.writer.write_pagexml(pcgts)
+                continue
+            else:
             return pcgts
 
         t1 = time.time()
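This last hunk adapts the no-columns early exit to batch mode: with dir_in set, run() iterates over a directory, so instead of returning it must write the empty PAGE-XML and continue with the next image. (The stray print('buraya galir??') is a leftover debug line, roughly "does it get here??" in Turkish; return pcgts presumably moves under the new else in the commit, which the whitespace-insensitive viewer hides.) A schematic of the resulting control flow, with all names hypothetical except write_pagexml:

def run(inputs, dir_in, writer, process_one):
    """Schematic of the amended loop: batch mode persists each result and
    keeps going; single-image mode returns the PAGE-XML object."""
    for item in inputs:
        pcgts = process_one(item)        # may be an empty PAGE-XML document
        if dir_in:
            writer.write_pagexml(pcgts)  # batch mode: write and move on
            continue
        else:
            return pcgts                 # single-image mode: hand back to caller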