From 735abc43f3102e8cf35d71dff2daeffc4a1cfeac Mon Sep 17 00:00:00 2001
From: vahid <vahid@linux-x3no.fritz.box>
Date: Thu, 28 Apr 2022 01:14:57 +0200
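Subject: [PATCH] Add option to ignore page extraction

Add the --ignore_page_extraction/--extract_page_included (-ipe/-epi) CLI
flag and pass it through to Eynollah. When it is set, extract_page() and
early_page_for_num_of_column_classification() skip the page model and use
the full image as the page box.

Also, when no columns are detected and dir_in is set, write the empty
PAGE-XML and continue instead of returning.
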
---
 qurator/eynollah/cli.py      |   8 ++
 qurator/eynollah/eynollah.py | 240 +++++++++++++----------------------
 2 files changed, 97 insertions(+), 151 deletions(-)

diff --git a/qurator/eynollah/cli.py b/qurator/eynollah/cli.py
index ca938c4..18ea583 100644
--- a/qurator/eynollah/cli.py
+++ b/qurator/eynollah/cli.py
@@ -108,6 +108,12 @@ from qurator.eynollah.eynollah import Eynollah
     is_flag=True,
     help="if this parameter set to true, this tool would use lighter version",
 )
+@click.option(
+    "--ignore_page_extraction/--extract_page_included",
+    "-ipe/-epi",
+    is_flag=True,
+    help="if this parameter set to true, this tool would ignore page extraction",
+)
 @click.option(
     "--log-level",
     "-l",
@@ -132,6 +138,7 @@ def main(
     allow_scaling,
     headers_off,
     light_version,
+    ignore_page_extraction,
     log_level
 ):
     if log_level:
@@ -161,6 +168,7 @@ def main(
         allow_scaling=allow_scaling,
         headers_off=headers_off,
         light_version=light_version,
+        ignore_page_extraction=ignore_page_extraction,
     )
     eynollah.run()
     #pcgts = eynollah.run()
diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py
index 48a640c..8957248 100644
--- a/qurator/eynollah/eynollah.py
+++ b/qurator/eynollah/eynollah.py
@@ -105,6 +105,7 @@ class Eynollah:
         allow_scaling=False,
         headers_off=False,
         light_version=False,
+        ignore_page_extraction=False,
         override_dpi=None,
         logger=None,
         pcgts=None,
@@ -133,6 +134,7 @@ class Eynollah:
         self.allow_scaling = allow_scaling
         self.headers_off = headers_off
         self.light_version = light_version
+        self.ignore_page_extraction = ignore_page_extraction
         self.pcgts = pcgts
         if not dir_in:
             self.plotter = None if not enable_plotting else EynollahPlotter(
@@ -886,169 +888,100 @@ class Eynollah:
         gc.collect()
         return prediction_true
 
-    def early_page_for_num_of_column_classification(self,img_bin):
-        self.logger.debug("enter early_page_for_num_of_column_classification")
-        if self.input_binary:
-            img =np.copy(img_bin)
-            img = img.astype(np.uint8)
-        else:
-            img = self.imread()
-        if not self.dir_in:
-            model_page, session_page = self.start_new_session_and_model(self.model_page_dir)
-        img = cv2.GaussianBlur(img, (5, 5), 0)
-        if self.dir_in:
-            img_page_prediction = self.do_prediction(False, img, self.model_page)
-        else:
-            img_page_prediction = self.do_prediction(False, img, model_page)
-
-        imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
-        _, thresh = cv2.threshold(imgray, 0, 255, 0)
-        thresh = cv2.dilate(thresh, KERNEL, iterations=3)
-        contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
-        if len(contours)>0:
-            cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))])
-            cnt = contours[np.argmax(cnt_size)]
-            x, y, w, h = cv2.boundingRect(cnt)
-            box = [x, y, w, h]
-        else:
-            box = [0, 0, img.shape[1], img.shape[0]]
-        croped_page, page_coord = crop_image_inside_box(box, img)
-        if not self.dir_in:
-            session_page.close()
-            del model_page
-            del session_page
-            K.clear_session()
-        gc.collect()
-        self.logger.debug("exit early_page_for_num_of_column_classification")
-        return croped_page, page_coord
-
     def extract_page(self):
         self.logger.debug("enter extract_page")
         cont_page = []
-        if not self.dir_in:
-            model_page, session_page = self.start_new_session_and_model(self.model_page_dir)
-        img = cv2.GaussianBlur(self.image, (5, 5), 0)
-        if not self.dir_in:
-            img_page_prediction = self.do_prediction(False, img, model_page)
-        else:
-            img_page_prediction = self.do_prediction(False, img, self.model_page)
-        imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
-        _, thresh = cv2.threshold(imgray, 0, 255, 0)
-        thresh = cv2.dilate(thresh, KERNEL, iterations=3)
-        contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
-        
-        if len(contours)>0:
-            cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))])
-            cnt = contours[np.argmax(cnt_size)]
-            x, y, w, h = cv2.boundingRect(cnt)
-            if x <= 30:
-                w += x
-                x = 0
-            if (self.image.shape[1] - (x + w)) <= 30:
-                w = w + (self.image.shape[1] - (x + w))
-            if y <= 30:
-                h = h + y
-                y = 0
-            if (self.image.shape[0] - (y + h)) <= 30:
-                h = h + (self.image.shape[0] - (y + h))
+        if not self.ignore_page_extraction:
+            if not self.dir_in:
+                model_page, session_page = self.start_new_session_and_model(self.model_page_dir)
+            img = cv2.GaussianBlur(self.image, (5, 5), 0)
+            if not self.dir_in:
+                img_page_prediction = self.do_prediction(False, img, model_page)
+            else:
+                img_page_prediction = self.do_prediction(False, img, self.model_page)
+            imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
+            _, thresh = cv2.threshold(imgray, 0, 255, 0)
+            thresh = cv2.dilate(thresh, KERNEL, iterations=3)
+            contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+            
+            if len(contours)>0:
+                cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))])
+                cnt = contours[np.argmax(cnt_size)]
+                x, y, w, h = cv2.boundingRect(cnt)
+                if x <= 30:
+                    w += x
+                    x = 0
+                if (self.image.shape[1] - (x + w)) <= 30:
+                    w = w + (self.image.shape[1] - (x + w))
+                if y <= 30:
+                    h = h + y
+                    y = 0
+                if (self.image.shape[0] - (y + h)) <= 30:
+                    h = h + (self.image.shape[0] - (y + h))
 
-            box = [x, y, w, h]
+                box = [x, y, w, h]
+            else:
+                box = [0, 0, img.shape[1], img.shape[0]]
+            croped_page, page_coord = crop_image_inside_box(box, self.image)
+            cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
+            if not self.dir_in:
+                session_page.close()
+                del model_page
+                del session_page
+                K.clear_session()
+            gc.collect()
+            self.logger.debug("exit extract_page")
         else:
-            box = [0, 0, img.shape[1], img.shape[0]]
-        croped_page, page_coord = crop_image_inside_box(box, self.image)
-        cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
-        if not self.dir_in:
-            session_page.close()
-            del model_page
-            del session_page
-            K.clear_session()
-        gc.collect()
-        self.logger.debug("exit extract_page")
+            box = [0, 0, self.image.shape[1], self.image.shape[0]]
+            croped_page, page_coord = crop_image_inside_box(box, self.image)
+            cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
         return croped_page, page_coord, cont_page
 
     def early_page_for_num_of_column_classification(self,img_bin):
-        self.logger.debug("enter early_page_for_num_of_column_classification")
-        if self.input_binary:
-            img =np.copy(img_bin)
-            img = img.astype(np.uint8)
+        if not self.ignore_page_extraction:
+            self.logger.debug("enter early_page_for_num_of_column_classification")
+            if self.input_binary:
+                img =np.copy(img_bin)
+                img = img.astype(np.uint8)
+            else:
+                img = self.imread()
+            if not self.dir_in:
+                model_page, session_page = self.start_new_session_and_model(self.model_page_dir)
+            img = cv2.GaussianBlur(img, (5, 5), 0)
+            
+            if self.dir_in:
+                img_page_prediction = self.do_prediction(False, img, self.model_page)
+            else:
+                img_page_prediction = self.do_prediction(False, img, model_page)
+
+            imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
+            _, thresh = cv2.threshold(imgray, 0, 255, 0)
+            thresh = cv2.dilate(thresh, KERNEL, iterations=3)
+            contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+            if len(contours)>0:
+                cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))])
+                cnt = contours[np.argmax(cnt_size)]
+                x, y, w, h = cv2.boundingRect(cnt)
+                box = [x, y, w, h]
+            else:
+                box = [0, 0, img.shape[1], img.shape[0]]
+            croped_page, page_coord = crop_image_inside_box(box, img)
+            
+            if not self.dir_in:
+                session_page.close()
+                del model_page
+                del session_page
+                K.clear_session()
+            
+            gc.collect()
+            
+            self.logger.debug("exit early_page_for_num_of_column_classification")
         else:
             img = self.imread()
-        if not self.dir_in:
-            model_page, session_page = self.start_new_session_and_model(self.model_page_dir)
-        img = cv2.GaussianBlur(img, (5, 5), 0)
-        
-        if self.dir_in:
-            img_page_prediction = self.do_prediction(False, img, self.model_page)
-        else:
-            img_page_prediction = self.do_prediction(False, img, model_page)
-
-        imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
-        _, thresh = cv2.threshold(imgray, 0, 255, 0)
-        thresh = cv2.dilate(thresh, KERNEL, iterations=3)
-        contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
-        if len(contours)>0:
-            cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))])
-            cnt = contours[np.argmax(cnt_size)]
-            x, y, w, h = cv2.boundingRect(cnt)
-            box = [x, y, w, h]
-        else:
             box = [0, 0, img.shape[1], img.shape[0]]
-        croped_page, page_coord = crop_image_inside_box(box, img)
-        
-        if not self.dir_in:
-            session_page.close()
-            del model_page
-            del session_page
-            K.clear_session()
-        
-        gc.collect()
-        
-        self.logger.debug("exit early_page_for_num_of_column_classification")
+            croped_page, page_coord = crop_image_inside_box(box, img)
         return croped_page, page_coord
 
-    def extract_page(self):
-        self.logger.debug("enter extract_page")
-        cont_page = []
-        if not self.dir_in:
-            model_page, session_page = self.start_new_session_and_model(self.model_page_dir)
-        img = cv2.GaussianBlur(self.image, (5, 5), 0)
-        if not self.dir_in:
-            img_page_prediction = self.do_prediction(False, img, model_page)
-        else:
-            img_page_prediction = self.do_prediction(False, img, self.model_page)
-        imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY)
-        _, thresh = cv2.threshold(imgray, 0, 255, 0)
-        thresh = cv2.dilate(thresh, KERNEL, iterations=3)
-        contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
-        
-        if len(contours)>0:
-            cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))])
-            cnt = contours[np.argmax(cnt_size)]
-            x, y, w, h = cv2.boundingRect(cnt)
-            if x <= 30:
-                w += x
-                x = 0
-            if (self.image.shape[1] - (x + w)) <= 30:
-                w = w + (self.image.shape[1] - (x + w))
-            if y <= 30:
-                h = h + y
-                y = 0
-            if (self.image.shape[0] - (y + h)) <= 30:
-                h = h + (self.image.shape[0] - (y + h))
-
-            box = [x, y, w, h]
-        else:
-            box = [0, 0, img.shape[1], img.shape[0]]
-        croped_page, page_coord = crop_image_inside_box(box, self.image)
-        cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]]))
-        if not self.dir_in:
-            session_page.close()
-            del model_page
-            del session_page
-            K.clear_session()
-        gc.collect()
-        self.logger.debug("exit extract_page")
-        return croped_page, page_coord, cont_page
 
     def extract_text_regions(self, img, patches, cols):
         self.logger.debug("enter extract_text_regions")
@@ -2960,10 +2893,15 @@ class Eynollah:
                 #self.logger.info('cont_page %s', cont_page)
             
             if not num_col:
+                # No columns: build an empty PAGE-XML; if self.dir_in is set, write it and continue, else return it.
                 self.logger.info("No columns detected, outputting an empty PAGE-XML")
                 pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], [], cont_page, [], [])
                 self.logger.info("Job done in %.1fs", time.time() - t1)
-                return pcgts
+                if self.dir_in:
+                    self.writer.write_pagexml(pcgts)
+                    continue
+                else:
+                    return pcgts
 
             t1 = time.time()
             if not self.light_version: