From 2bef2d4c049eb7c760e26b397ae72264bc1627c8 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 27 Jan 2021 11:51:23 +0100 Subject: [PATCH 01/89] use pathlib to determine f_name --- sbb_newspapers_org_image/eynollah.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 979a5f2..efb7e98 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -9,6 +9,7 @@ import random import sys import time import warnings +from pathlib import Path from multiprocessing import Process, Queue, cpu_count from sys import getsizeof @@ -148,12 +149,8 @@ class eynollah: self.headers_off = headers_off self.dir_of_deskewed = dir_of_deskewed self.dir_of_all = dir_of_all - if self.f_name is None: - try: - self.f_name = image_dir.split("/")[len(image_dir.split("/")) - 1] - self.f_name = self.f_name.split(".")[0] - except: - self.f_name = self.f_name.split(".")[0] + if not self.f_name: + self.f_name = Path(Path(image_dir).name).stem self.dir_models = dir_models self.kernel = np.ones((5, 5), np.uint8) From 19895b01948fd82db4414d862bd93425da1a8c30 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 27 Jan 2021 11:59:42 +0100 Subject: [PATCH 02/89] rename: image_{dir,filename}, {f_name,image_filename_stem} --- sbb_newspapers_org_image/eynollah.py | 82 ++++++++++++++-------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index efb7e98..b4f5960 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -123,8 +123,8 @@ SLOPE_THRESHOLD = 0.13 class eynollah: def __init__( self, - image_dir, - f_name, + image_filename, + image_filename_stem, dir_out, dir_models, dir_of_cropped_images=None, @@ -137,9 +137,9 @@ class eynollah: allow_scaling=False, headers_off=False ): - self.image_dir = image_dir # XXX This 
does not seem to be a directory as the name suggests, but a file + self.image_filename = image_filename # XXX This does not seem to be a directory as the name suggests, but a file self.dir_out = dir_out - self.f_name = f_name + self.image_filename_stem = image_filename_stem self.dir_of_cropped_images = dir_of_cropped_images self.allow_enhancement = allow_enhancement self.curved_line = curved_line @@ -149,8 +149,8 @@ class eynollah: self.headers_off = headers_off self.dir_of_deskewed = dir_of_deskewed self.dir_of_all = dir_of_all - if not self.f_name: - self.f_name = Path(Path(image_dir).name).stem + if not self.image_filename_stem: + self.image_filename_stem = Path(Path(image_filename).name).stem self.dir_models = dir_models self.kernel = np.ones((5, 5), np.uint8) @@ -291,18 +291,18 @@ class eynollah: return prediction_true def check_dpi(self): - dpi = os.popen('identify -format "%x " ' + self.image_dir).read() + dpi = os.popen('identify -format "%x " ' + self.image_filename).read() return int(float(dpi)) def resize_image_with_column_classifier(self, is_image_enhanced): dpi = self.check_dpi() - img = cv2.imread(self.image_dir) + img = cv2.imread(self.image_filename) img = img.astype(np.uint8) _, page_coord = self.early_page_for_num_of_column_classification() model_num_classifier, session_col_classifier = self.start_new_session_and_model(self.model_dir_of_col_classifier) - img_1ch = cv2.imread(self.image_dir, 0) + img_1ch = cv2.imread(self.image_filename, 0) width_early = img_1ch.shape[1] @@ -414,14 +414,14 @@ class eynollah: def resize_and_enhance_image_with_column_classifier(self, is_image_enhanced): dpi = self.check_dpi() - img = cv2.imread(self.image_dir) + img = cv2.imread(self.image_filename) img = img.astype(np.uint8) _, page_coord = self.early_page_for_num_of_column_classification() model_num_classifier, session_col_classifier = self.start_new_session_and_model(self.model_dir_of_col_classifier) - img_1ch = cv2.imread(self.image_dir, 0) + img_1ch = 
cv2.imread(self.image_filename, 0) img_1ch = img_1ch.astype(np.uint8) width_early = img_1ch.shape[1] @@ -538,7 +538,7 @@ class eynollah: # img_new=resize_image(img,img_h_new,img_w_new) image_res = self.predict_enhancement(img_new) - # cv2.imwrite(os.path.join(self.dir_out, self.f_name) + ".tif",self.image) + # cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif",self.image) # self.image=self.image.astype(np.uint16) # self.scale_x=1 @@ -553,7 +553,7 @@ class eynollah: img_w_new=int(img.shape[1]/float(img.shape[0]) * 3000) img_new=resize_image(img,img_h_new,img_w_new) image_res=self.predict_enhancement(img_new) - #cv2.imwrite(os.path.join(self.dir_out, self.f_name) + ".tif",self.image) + #cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif",self.image) #self.image=self.image.astype(np.uint16) ##self.scale_x=1 ##self.scale_y=1 @@ -588,7 +588,7 @@ class eynollah: def get_image_and_scales_after_enhancing(self, img_org, img_res): - # self.image = cv2.imread(self.image_dir) + # self.image = cv2.imread(self.image_filename) self.image = np.copy(img_res) self.image = self.image.astype(np.uint8) @@ -783,7 +783,7 @@ class eynollah: return prediction_true def early_page_for_num_of_column_classification(self): - img = cv2.imread(self.image_dir) + img = cv2.imread(self.image_filename) img = img.astype(np.uint8) patches = False model_page, session_page = self.start_new_session_and_model(self.model_page_dir) @@ -1166,7 +1166,7 @@ class eynollah: img_int_p[img_int_p > 0] = 1 # slope_for_all=self.return_deskew_slope_new(img_int_p,sigma_des) - slope_for_all = return_deskew_slop(img_int_p, sigma_des, dir_of_all=self.dir_of_all, f_name=self.f_name) + slope_for_all = return_deskew_slop(img_int_p, sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) if abs(slope_for_all) < 0.5: slope_for_all = [slope_deskew][0] @@ -1177,7 +1177,7 @@ class eynollah: except: slope_for_all = 999 - 
##slope_for_all=return_deskew_slop(img_int_p,sigma_des, dir_of_all=self.dir_of_all, f_name=self.f_name) + ##slope_for_all=return_deskew_slop(img_int_p,sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) if slope_for_all == 999: slope_for_all = [slope_deskew][0] @@ -1207,7 +1207,7 @@ class eynollah: textline_biggest_region = mask_biggest * textline_mask_tot_ea # print(slope_for_all,'slope_for_all') - textline_rotated_seperated = seperate_lines_new2(textline_biggest_region[y : y + h, x : x + w], 0, num_col, slope_for_all, self.dir_of_all, self.f_name) + textline_rotated_seperated = seperate_lines_new2(textline_biggest_region[y : y + h, x : x + w], 0, num_col, slope_for_all, self.dir_of_all, self.image_filename_stem) # new line added ##print(np.shape(textline_rotated_seperated),np.shape(mask_biggest)) @@ -1329,7 +1329,7 @@ class eynollah: img_int_p[img_int_p > 0] = 1 # slope_for_all=self.return_deskew_slope_new(img_int_p,sigma_des) - slope_for_all = return_deskew_slop(img_int_p, sigma_des, dir_of_all=self.dir_of_all, f_name=self.f_name) + slope_for_all = return_deskew_slop(img_int_p, sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) if abs(slope_for_all) <= 0.5: slope_for_all = [slope_deskew][0] @@ -1337,7 +1337,7 @@ class eynollah: except: slope_for_all = 999 - ##slope_for_all=return_deskew_slop(img_int_p,sigma_des, dir_of_all=self.dir_of_all, f_name=self.f_name) + ##slope_for_all=return_deskew_slop(img_int_p,sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) if slope_for_all == 999: slope_for_all = [slope_deskew][0] @@ -1450,7 +1450,7 @@ class eynollah: sigma_des = 1 crop_img[crop_img > 0] = 1 - slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, dir_of_all=self.dir_of_all, f_name=self.f_name) + slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) 
except: slope_corresponding_textregion = 999 @@ -1478,7 +1478,7 @@ class eynollah: found_polygons_text_region_h = contours_h # create the file structure - pcgts, page = create_page_xml(self.image_dir, self.height_org, self.width_org) + pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) page_print_sub = ET.SubElement(page, "PrintSpace") coord_page = ET.SubElement(page_print_sub, "Coords") @@ -1995,7 +1995,7 @@ class eynollah: print(self.f_name) print(os.path.join(dir_of_image, self.f_name) + ".xml") tree = ET.ElementTree(pcgts) - tree.write(os.path.join(dir_of_image, self.f_name) + ".xml") + tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") def write_into_page_xml(self, contours, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): @@ -2004,7 +2004,7 @@ class eynollah: ##found_polygons_text_region_h=contours_h # create the file structure - pcgts, page = create_page_xml(self.image_dir, self.height_org, self.width_org) + pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) page_print_sub = ET.SubElement(page, "PrintSpace") coord_page = ET.SubElement(page_print_sub, "Coords") points_page_print = "" @@ -2310,11 +2310,11 @@ class eynollah: pass - print(self.f_name) - # print(os.path.join(dir_of_image, self.f_name) + ".xml") + print(self.image_filename_stem) + # print(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") tree = ET.ElementTree(pcgts) - tree.write(os.path.join(dir_of_image, self.f_name) + ".xml") - # cv2.imwrite(os.path.join(dir_of_image, self.f_name) + ".tif",self.image_org) + tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") + # cv2.imwrite(os.path.join(dir_of_image, self.image_filename_stem) + ".tif",self.image_org) def 
get_regions_from_xy_2models(self,img,is_image_enhanced): img_org=np.copy(img) @@ -2597,7 +2597,7 @@ class eynollah: croped_page = resize_image(croped_page, int(croped_page.shape[0] / self.scale_y), int(croped_page.shape[1] / self.scale_x)) - path = os.path.join(dir_of_cropped_imgs, self.f_name + "_" + str(index) + ".jpg") + path = os.path.join(dir_of_cropped_imgs, self.image_filename_stem + "_" + str(index) + ".jpg") cv2.imwrite(path, croped_page) index += 1 @@ -2898,7 +2898,7 @@ class eynollah: colors = [im.cmap(im.norm(value)) for value in values] patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=40) - plt.savefig(os.path.join(self.dir_of_layout, self.f_name + "_layout_main.png")) + plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout_main.png")) def save_plot_of_layout_main_all(self, text_regions_p, image_page): values = np.unique(text_regions_p[:, :]) @@ -2918,7 +2918,7 @@ class eynollah: patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - plt.savefig(os.path.join(self.dir_of_all, self.f_name + "_layout_main_and_page.png")) + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_main_and_page.png")) def save_plot_of_layout(self, text_regions_p, image_page): values = np.unique(text_regions_p[:, :]) @@ -2933,7 +2933,7 @@ class eynollah: colors = [im.cmap(im.norm(value)) for value in values] patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, 
borderaxespad=0.0, fontsize=40) - plt.savefig(os.path.join(self.dir_of_layout, self.f_name + "_layout.png")) + plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout.png")) def save_plot_of_layout_all(self, text_regions_p, image_page): values = np.unique(text_regions_p[:, :]) @@ -2953,15 +2953,15 @@ class eynollah: patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - plt.savefig(os.path.join(self.dir_of_all, self.f_name + "_layout_and_page.png")) + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_and_page.png")) def save_deskewed_image(self, slope_deskew): img_rotated = rotyate_image_different(self.image_org, slope_deskew) if self.dir_of_all is not None: - cv2.imwrite(os.path.join(self.dir_of_all, self.f_name + "_org.png"), self.image_org) + cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_org.png"), self.image_org) - cv2.imwrite(os.path.join(self.dir_of_deskewed, self.f_name + "_deskewed.png"), img_rotated) + cv2.imwrite(os.path.join(self.dir_of_deskewed, self.image_filename_stem + "_deskewed.png"), img_rotated) del img_rotated def run(self): @@ -2978,7 +2978,7 @@ class eynollah: K.clear_session() scale = 1 if (self.allow_enhancement) and is_image_enhanced: - cv2.imwrite(os.path.join(self.dir_out, self.f_name) + ".tif", img_res) + cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif", img_res) img_res = img_res.astype(np.uint8) self.get_image_and_scales(img_org, img_res, scale) @@ -3004,7 +3004,7 @@ class eynollah: print("textregion: " + str(time.time() - t1)) - img_g = cv2.imread(self.image_dir, 0) + img_g = cv2.imread(self.image_filename, 0) img_g = img_g.astype(np.uint8) img_g3 = np.zeros((img_g.shape[0], img_g.shape[1], 3)) @@ -3021,7 +3021,7 @@ class eynollah: # 
print(image_page.shape,'page') if self.dir_of_all is not None: - cv2.imwrite(os.path.join(self.dir_of_all, self.f_name + "_page.png"), image_page) + cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_page.png"), image_page) ########## K.clear_session() gc.collect() @@ -3094,7 +3094,7 @@ class eynollah: patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - plt.savefig(os.path.join(self.dir_of_all, self.f_name + "_textline_and_page.png")) + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_textline_and_page.png")) print("textline: " + str(time.time() - t1)) # plt.imshow(textline_mask_tot_ea) # plt.show() @@ -3102,8 +3102,8 @@ class eynollah: sigma = 2 main_page_deskew = True - slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2), sigma, main_page_deskew, dir_of_all=self.dir_of_all, f_name=self.f_name) - slope_first = 0 # return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2),sigma, dir_of_all=self.dir_of_all, f_name=self.f_name) + slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2), sigma, main_page_deskew, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) + slope_first = 0 # return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2),sigma, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) if self.dir_of_deskewed is not None: self.save_deskewed_image(slope_deskew) From f5e11a10566f5e510cc874e5ff644fd3e70f740a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 27 Jan 2021 12:11:26 +0100 Subject: [PATCH 03/89] remove commented out code in eynollah.py --- sbb_newspapers_org_image/eynollah.py | 181 +-------------------------- 1 file changed, 3 insertions(+), 
178 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index b4f5960..827d9ce 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -547,24 +547,6 @@ class eynollah: # self.width_org = self.image.shape[1] is_image_enhanced = True else: - """ - if img.shape[0]<=2530 and img.shape[0]>=img.shape[1]: - img_h_new=3000 - img_w_new=int(img.shape[1]/float(img.shape[0]) * 3000) - img_new=resize_image(img,img_h_new,img_w_new) - image_res=self.predict_enhancement(img_new) - #cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif",self.image) - #self.image=self.image.astype(np.uint16) - ##self.scale_x=1 - ##self.scale_y=1 - ##self.height_org = self.image.shape[0] - ##self.width_org = self.image.shape[1] - is_image_enhanced=True - else: - is_image_enhanced=False - image_res=np.copy(img) - - """ is_image_enhanced = False num_column_is_classified = True image_res = np.copy(img) @@ -787,7 +769,6 @@ class eynollah: img = img.astype(np.uint8) patches = False model_page, session_page = self.start_new_session_and_model(self.model_page_dir) - ###img = otsu_copy(self.image) for ii in range(1): img = cv2.GaussianBlur(img, (5, 5), 0) @@ -831,7 +812,6 @@ class eynollah: def extract_page(self): patches = False model_page, session_page = self.start_new_session_and_model(self.model_page_dir) - ###img = otsu_copy(self.image) for ii in range(1): img = cv2.GaussianBlur(self.image, (5, 5), 0) @@ -883,28 +863,6 @@ class eynollah: img_height_h = img.shape[0] img_width_h = img.shape[1] - ###if patches and cols>=3 : - ###model_region, session_region = self.start_new_session_and_model(self.model_region_dir_fully) - ###if not patches: - ###model_region, session_region = self.start_new_session_and_model(self.model_region_dir_fully_np) - - ###if patches and cols==2 : - ###model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_2col) - - ###if patches and cols==1 : 
- ###model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_2col) - - ###if patches and cols>=2: - - ###img = otsu_copy_binary(img)#otsu_copy(img) - ###img = img.astype(np.uint8) - - ###if patches and cols==1: - - ###img = otsu_copy_binary(img)#otsu_copy(img) - ###img = img.astype(np.uint8) - ###img= resize_image(img, int(img_height_h*1), int(img_width_h*1) ) - if patches: model_region, session_region = self.start_new_session_and_model(self.model_region_dir_fully) if not patches: @@ -1306,7 +1264,6 @@ class eynollah: slope_for_all = [slope_deskew][0] all_text_region_raw = textline_mask_tot_ea[boxes_text[mv][1] : boxes_text[mv][1] + boxes_text[mv][3], boxes_text[mv][0] : boxes_text[mv][0] + boxes_text[mv][2]] - ###cnt_clean_rot=textline_contours_postprocessing(all_text_region_raw,slopes[jj],contours_only_text_parent[jj],boxes_text[jj],slope_first) cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contours_par_per_process[mv], boxes_text[mv], 0) textlines_rectangles_per_each_subprocess.append(cnt_clean_rot) @@ -1366,7 +1323,6 @@ class eynollah: ##plt.show() all_text_region_raw[mask_only_con_region == 0] = 0 - ###cnt_clean_rot=textline_contours_postprocessing(all_text_region_raw,slopes[jj],contours_only_text_parent[jj],boxes_text[jj],slope_first) cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contours_par_per_process[mv], boxes_text[mv], slope_first) textlines_rectangles_per_each_subprocess.append(cnt_clean_rot) @@ -1829,10 +1785,8 @@ class eynollah: try: try: - ###id_indexer=id_indexer id_indexer_l=id_indexer_l except: - ###id_indexer=0 id_indexer_l=0 for mm in range(len(found_polygons_marginals)): textregion=ET.SubElement(page, 'TextRegion') @@ -2191,10 +2145,8 @@ class eynollah: #id_indexer_l=0 try: - ###id_indexer=id_indexer id_indexer_l = id_indexer_l except: - ###id_indexer=0 id_indexer_l = 0 for mm in range(len(found_polygons_marginals)): @@ -2397,7 +2349,6 
@@ class eynollah: del model_region del session_region gc.collect() - ###K.clear_session() model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p2) @@ -2438,7 +2389,6 @@ class eynollah: del model_region del session_region gc.collect() - ###K.clear_session() mask_zeros2=(prediction_regions_org2[:,:,0]==0)*1 mask_lines2=(prediction_regions_org2[:,:,0]==3)*1 @@ -2469,67 +2419,6 @@ class eynollah: del mask_zeros2 del prediction_regions_org2 - #if is_image_enhanced: - #pass - #else: - #model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p2) - - #gaussian_filter=False - #patches=True - #binary=False - - - - - #ratio_x=1 - #ratio_y=1 - #median_blur=False - - #img= resize_image(img_org, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x)) - - #if binary: - #img = self.otsu_copy_binary(img)#self.otsu_copy(img) - #img = img.astype(np.uint16) - - #if median_blur: - #img=cv2.medianBlur(img,5) - #if gaussian_filter: - #img= cv2.GaussianBlur(img,(5,5),0) - #img = img.astype(np.uint16) - #prediction_regions_org2=self.do_prediction(patches,img,model_region) - - #prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h ) - - ##plt.imshow(prediction_regions_org2[:,:,0]) - ##plt.show() - ##sys.exit() - ###prediction_regions_org=prediction_regions_org[:,:,0] - - #session_region.close() - #del model_region - #del session_region - #gc.collect() - ####K.clear_session() - - #mask_zeros2=(prediction_regions_org2[:,:,0]==0)*1 - #mask_lines2=(prediction_regions_org2[:,:,0]==3)*1 - - #text_sume_early=( (prediction_regions_org[:,:]==1)*1 ).sum() - - - #prediction_regions_org[(prediction_regions_org[:,:]==1) & (mask_zeros2[:,:]==1)]=0 - - ###prediction_regions_org[mask_lines2[:,:]==1]=3 - #prediction_regions_org[(mask_lines2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 - - #text_sume_second=( (prediction_regions_org[:,:]==1)*1 ).sum() - - 
#print(text_sume_second/float(text_sume_early)*100,'twomodelsratio') - - #del mask_lines2 - #del mask_zeros2 - #del prediction_regions_org2 - mask_lines_only=(prediction_regions_org[:,:]==3)*1 prediction_regions_org = cv2.erode(prediction_regions_org[:,:], self.kernel, iterations=2) @@ -2538,38 +2427,21 @@ class eynollah: #plt.show() prediction_regions_org = cv2.dilate(prediction_regions_org[:,:], self.kernel, iterations=2) - mask_texts_only=(prediction_regions_org[:,:]==1)*1 - mask_images_only=(prediction_regions_org[:,:]==2)*1 - - pixel_img=1 min_area_text=0.00001 polygons_of_only_texts=return_contours_of_interested_region(mask_texts_only,pixel_img,min_area_text) - polygons_of_only_images=return_contours_of_interested_region(mask_images_only,pixel_img) - polygons_of_only_lines=return_contours_of_interested_region(mask_lines_only,pixel_img,min_area_text) - text_regions_p_true=np.zeros(prediction_regions_org.shape) - #text_regions_p_true[:,:]=text_regions_p_1[:,:] - text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_lines, color=(3,3,3)) - - ##text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_images, color=(2,2,2)) text_regions_p_true[:,:][mask_images_only[:,:]==1]=2 text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1)) - ##print(np.unique(text_regions_p_true)) - - - #text_regions_p_true_3d=np.repeat(text_regions_p_1[:, :, np.newaxis], 3, axis=2) - #text_regions_p_true_3d=text_regions_p_true_3d.astype(np.uint8) - del polygons_of_only_texts del polygons_of_only_images del polygons_of_only_lines @@ -2588,9 +2460,6 @@ class eynollah: def write_images_into_directory(self, img_contoures, dir_of_cropped_imgs, image_page): index = 0 for cont_ind in img_contoures: - # cont_ind[:,0,0]=cont_ind[:,0,0]/self.scale_x - # cont_ind[:,0,1]=cont_ind[:,0,1]/self.scale_y - x, y, w, h = cv2.boundingRect(cont_ind) box = [x, y, w, h] croped_page, page_coord = crop_image_inside_box(box, image_page) 
@@ -2710,7 +2579,6 @@ class eynollah: args_contours_h = np.array(range(len(arg_text_con_h))) order_by_con_head = np.zeros(len(arg_text_con_h)) - ##### ref_point = 0 order_of_texts_tot = [] @@ -3015,14 +2883,12 @@ class eynollah: img_g3[:, :, 1] = img_g[:, :] img_g3[:, :, 2] = img_g[:, :] - ###self.produce_groundtruth_for_textline() image_page, page_coord = self.extract_page() # print(image_page.shape,'page') if self.dir_of_all is not None: cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_page.png"), image_page) - ########## K.clear_session() gc.collect() @@ -3272,7 +3138,6 @@ class eynollah: else: regions_fully_np = filter_small_drop_capitals_from_no_patch_layout(regions_fully_np, text_regions_p) - # regions_fully_np=filter_small_drop_capitals_from_no_patch_layout(regions_fully_np,text_regions_p) # plt.imshow(regions_fully_np[:,:,0]) # plt.show() @@ -3288,7 +3153,6 @@ class eynollah: # plt.show() text_regions_p[:, :][regions_fully[:, :, 0] == 4] = 4 - ##text_regions_p[:,:][(regions_fully[:,:,0]==7) & (text_regions_p[:,:]!=0)]=7 text_regions_p[:, :][regions_fully_np[:, :, 0] == 4] = 4 @@ -3313,8 +3177,6 @@ class eynollah: print("full layout in: " + str(time.time() - t1)) - # sys.exit() - pixel_img = 5 polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) @@ -3322,10 +3184,9 @@ class eynollah: # plt.show() # print(img_revised_tab.shape,text_regions_p_1_n.shape) - # text_regions_p_1_n=resize_image(text_regions_p_1_n,img_revised_tab.shape[0],img_revised_tab.shape[1]) - # print(np.unique(text_regions_p_1_n),'uni') + text_only = ((img_revised_tab[:, :] == 1)) * 1 if np.abs(slope_deskew) >= SLOPE_THRESHOLD: text_only_d = ((text_regions_p_1_n[:, :] == 1)) * 1 @@ -3408,7 +3269,6 @@ class eynollah: for i in range(len(contours_only_text_parent)): # img1=np.zeros((text_only.shape[0],text_only.shape[1],3)) # img1=cv2.fillPoly(img1,pts=[contours_only_text_parent[i]] ,color=(1,1,1)) - # plt.imshow(img1[:,:,0]) # plt.show() @@ 
-3425,7 +3285,6 @@ class eynollah: # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) - # plt.imshow(img2[:,:,0]) # plt.show() @@ -3448,21 +3307,13 @@ class eynollah: cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest]) cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent) # print(areas_cnt_text_parent,'areas_cnt_text_parent') - - ###index_con_parents_d=np.argsort(areas_cnt_text_parent_d) - ##contours_only_text_parent_d=list(np.array(contours_only_text_parent_d)[index_con_parents_d]) - ###areas_cnt_text_parent_d=list(np.array(areas_cnt_text_parent_d)[index_con_parents_d]) - - ##print(areas_cnt_text_parent_d,'areas_cnt_text_parent_d') - + # print(areas_cnt_text_parent_d,'areas_cnt_text_parent_d') # print(len(contours_only_text_parent),len(contours_only_text_parent_d),'vizzz') txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) - ###boxes_text,_= get_text_region_boxes_by_given_contours(contours_only_text_parent) boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) - ####boxes_text_h,_= get_text_region_boxes_by_given_contours(text_only_h,contours_only_text_parent_h,image_page) if not self.curved_line: slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) @@ -3473,14 +3324,10 @@ class eynollah: scale_param = 1 all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, 
cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) - # all_found_texline_polygons,boxes_text,txt_con_org,contours_only_text_parent,all_box_coord=self.get_slopes_and_deskew_new_curved(txt_con_org,contours_only_text_parent,textline_mask_tot_ea,image_page_rotated,boxes_text,text_only,num_col,scale_param) all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) - # slopes=list(np.zeros(len(contours_only_text_parent))) - all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) - # all_found_texline_polygons,boxes_text,txt_con_org,contours_only_text_parent,all_box_coord=self.get_slopes_and_deskew_new_curved(txt_con_org,contours_only_text_parent,textline_mask_tot_ea,image_page_rotated,boxes_text,text_only,num_col,scale_param) all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) index_of_vertical_text_contours = np.array(range(len(slopes)))[(abs(np.array(slopes)) > 60)] @@ -3490,23 +3337,9 @@ class eynollah: K.clear_session() gc.collect() - # contours_only_text_parent_d_ordered=list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) - ###print(index_by_text_par_con,'index_by_text_par_con') + # print(index_by_text_par_con,'index_by_text_par_con') if self.full_layout: - ##for iii in range(len(contours_only_text_parent)): - ##img1=np.zeros((text_only.shape[0],text_only.shape[1],3)) - 
##img1=cv2.fillPoly(img1,pts=[contours_only_text_parent[iii]] ,color=(1,1,1)) - - ##plt.imshow(img1[:,:,0]) - ##plt.show() - - ##img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) - ##img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d_ordered[iii]] ,color=(1,1,1)) - - ##plt.imshow(img2[:,:,0]) - ##plt.show() - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) @@ -3516,11 +3349,6 @@ class eynollah: text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) - ###text_regions_p,contours_only_text_parent,contours_only_text_parent_h,all_box_coord,all_box_coord_h,all_found_texline_polygons,all_found_texline_polygons_h=check_any_text_region_in_model_one_is_main_or_header(text_regions_p,regions_fully,contours_only_text_parent,all_box_coord,all_found_texline_polygons) - # text_regions_p=self.return_region_segmentation_after_implementing_not_head_maintext_parallel(text_regions_p,boxes) - - # if you want to save the layout result just uncommet following plot - if self.dir_of_layout is not None: self.save_plot_of_layout(text_regions_p, image_page) if self.dir_of_all is not None: @@ -3535,7 +3363,6 @@ class eynollah: pixel_img = 4 polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) - # polygons_of_drop_capitals=[] all_found_texline_polygons = adhere_drop_capital_region_into_cprresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, 
all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, kernel=self.kernel, curved_line=self.curved_line) @@ -3554,9 +3381,7 @@ class eynollah: num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) # print(peaks_neg_fin,peaks_neg_fin_d,'num_col2') - # print(spliter_y_new,spliter_y_new_d,'num_col_classifier') - # print(matrix_of_lines_ch.shape,matrix_of_lines_ch_d.shape,'matrix_of_lines_ch') if num_col_classifier >= 3: From 52df6972ad588aca1d2bc19732006c3b747697e1 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 27 Jan 2021 12:52:51 +0100 Subject: [PATCH 04/89] rename: image_{dir,filename}, {f_name,image_filename_stem} --- sbb_newspapers_org_image/eynollah.py | 34 +------ .../utils/separate_lines.py | 99 +------------------ 2 files changed, 10 insertions(+), 123 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 827d9ce..aaf1dbf 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1,3 +1,4 @@ +# pylint: disable=no-member """ tool to extract table form data from alto xml data """ @@ -200,15 +201,8 @@ class eynollah: nxf = img_w / float(width_mid) nyf = img_h / float(height_mid) - if nxf > int(nxf): - nxf = int(nxf) + 1 - else: - nxf = int(nxf) - - if nyf > int(nyf): - nyf = int(nyf) + 1 - else: - nyf = int(nyf) + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) for i in range(nxf): for j in range(nyf): @@ -295,7 +289,6 @@ class eynollah: return int(float(dpi)) def resize_image_with_column_classifier(self, is_image_enhanced): - dpi = self.check_dpi() img = cv2.imread(self.image_filename) img = img.astype(np.uint8) @@ -540,11 +533,6 @@ class eynollah: image_res = self.predict_enhancement(img_new) # 
cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif",self.image) # self.image=self.image.astype(np.uint16) - - # self.scale_x=1 - # self.scale_y=1 - # self.height_org = self.image.shape[0] - # self.width_org = self.image.shape[1] is_image_enhanced = True else: is_image_enhanced = False @@ -570,8 +558,6 @@ class eynollah: def get_image_and_scales_after_enhancing(self, img_org, img_res): - # self.image = cv2.imread(self.image_filename) - self.image = np.copy(img_res) self.image = self.image.astype(np.uint8) self.image_org = np.copy(img_org) @@ -630,15 +616,8 @@ class eynollah: nxf = img_w / float(width_mid) nyf = img_h / float(height_mid) - if nxf > int(nxf): - nxf = int(nxf) + 1 - else: - nxf = int(nxf) - - if nyf > int(nyf): - nyf = int(nyf) + 1 - else: - nyf = int(nyf) + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) for i in range(nxf): for j in range(nyf): @@ -665,11 +644,8 @@ class eynollah: index_y_d = img_h - img_height_model img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :] - label_p_pred = model.predict(img_patch.reshape(1, img_patch.shape[0], img_patch.shape[1], img_patch.shape[2])) - seg = np.argmax(label_p_pred, axis=3)[0] - seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) if i == 0 and j == 0: diff --git a/sbb_newspapers_org_image/utils/separate_lines.py b/sbb_newspapers_org_image/utils/separate_lines.py index d339707..eb68bdf 100644 --- a/sbb_newspapers_org_image/utils/separate_lines.py +++ b/sbb_newspapers_org_image/utils/separate_lines.py @@ -1485,7 +1485,7 @@ def textline_contours_postprocessing(textline_mask, slope, contour_text_interest return contours_rotated_clean -def seperate_lines_new2(img_path, thetha, num_col, slope_region, dir_of_all, f_name): +def seperate_lines_new2(img_path, thetha, num_col, slope_region, dir_of_all, image_filename_stem): if num_col == 1: num_patches = int(img_path.shape[1] / 200.0) @@ -1536,7 +1536,7 @@ def 
seperate_lines_new2(img_path, thetha, num_col, slope_region, dir_of_all, f_n sigma = 2 try: - slope_xline = return_deskew_slop(img_xline, sigma, dir_of_all=dir_of_all, f_name=f_name) + slope_xline = return_deskew_slop(img_xline, sigma, dir_of_all=dir_of_all, image_filename_stem=image_filename_stem) except: slope_xline = 0 @@ -1593,7 +1593,7 @@ def seperate_lines_new2(img_path, thetha, num_col, slope_region, dir_of_all, f_n # plt.show() return img_patch_ineterst_revised -def return_deskew_slop(img_patch_org, sigma_des, main_page=False, dir_of_all=None, f_name=None): +def return_deskew_slop(img_patch_org, sigma_des, main_page=False, dir_of_all=None, image_filename_stem=None): if main_page and dir_of_all is not None: @@ -1610,7 +1610,7 @@ def return_deskew_slop(img_patch_org, sigma_des, main_page=False, dir_of_all=Non plt.yticks([0,len(gaussian_filter1d(img_patch_org.sum(axis=1), 3))]) plt.gca().invert_yaxis() - plt.savefig(os.path.join(dir_of_all, f_name+'_density_of_textline.png')) + plt.savefig(os.path.join(dir_of_all, image_filename_stem+'_density_of_textline.png')) #print(np.max(img_patch_org.sum(axis=0)) ,np.max(img_patch_org.sum(axis=1)),'axislar') #img_patch_org=resize_image(img_patch_org,int(img_patch_org.shape[0]*2.5),int(img_patch_org.shape[1]/2.5)) @@ -1647,53 +1647,23 @@ def return_deskew_slop(img_patch_org, sigma_des, main_page=False, dir_of_all=Non #plt.show() angels=np.array([-45, 0 , 45 , 90 , ])#np.linspace(-12,12,100)#np.array([0 , 45 , 90 , -45]) - #res=[] - #num_of_peaks=[] - #index_cor=[] var_res=[] - #indexer=0 for rot in angels: img_rot=rotate_image(img_resized,rot) #plt.imshow(img_rot) #plt.show() img_rot[img_rot!=0]=1 - #res_me=np.mean(self.find_num_col_deskew(img_rot,sigma_des,2.0 )) - - #neg_peaks,var_spectrum=self.find_num_col_deskew(img_rot,sigma_des,20.3 ) #print(var_spectrum,'var_spectrum') try: var_spectrum=find_num_col_deskew(img_rot,sigma_des,20.3 ) ##print(rot,var_spectrum,'var_spectrum') - #res_me=np.mean(neg_peaks) - #if 
res_me==0: - #res_me=1000000000000000000000 - #else: - #pass - - #res_num=len(neg_peaks) except: - #res_me=1000000000000000000000 - #res_num=0 var_spectrum=0 - #if self.isNaN(res_me): - #pass - #else: - #res.append( res_me ) - #var_res.append(var_spectrum) - #num_of_peaks.append( res_num ) - #index_cor.append(indexer) - #indexer=indexer+1 - var_res.append(var_spectrum) - #index_cor.append(indexer) - #indexer=indexer+1 - - try: var_res=np.array(var_res) - ang_int=angels[np.argmax(var_res)]#angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] except: ang_int=0 @@ -1701,32 +1671,19 @@ def return_deskew_slop(img_patch_org, sigma_des, main_page=False, dir_of_all=Non angels=np.linspace(ang_int-22.5,ang_int+22.5,100) - #res=[] - #num_of_peaks=[] - #index_cor=[] var_res=[] - - for rot in angels: img_rot=rotate_image(img_resized,rot) ##plt.imshow(img_rot) ##plt.show() img_rot[img_rot!=0]=1 - #res_me=np.mean(self.find_num_col_deskew(img_rot,sigma_des,2.0 )) try: var_spectrum=find_num_col_deskew(img_rot,sigma_des,20.3 ) - except: var_spectrum=0 - var_res.append(var_spectrum) - - - - try: var_res=np.array(var_res) - ang_int=angels[np.argmax(var_res)]#angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] except: ang_int=0 @@ -1745,9 +1702,6 @@ def return_deskew_slop(img_patch_org, sigma_des, main_page=False, dir_of_all=Non #plt.imshow(img_rot) #plt.show() img_rot[img_rot!=0]=1 - #res_me=np.mean(self.find_num_col_deskew(img_rot,sigma_des,2.0 )) - - #neg_peaks,var_spectrum=self.find_num_col_deskew(img_rot,sigma_des,20.3 ) #print(var_spectrum,'var_spectrum') try: @@ -1769,41 +1723,29 @@ def return_deskew_slop(img_patch_org, sigma_des, main_page=False, dir_of_all=Non plt.plot(angels[np.argmax(var_res)],var_res[np.argmax(np.array(var_res))] ,'*',markersize=50,label='Angle of deskewing=' +str("{:.2f}".format(angels[np.argmax(var_res)]))+r'$\degree$') plt.legend(loc='best') - 
plt.savefig(os.path.join(dir_of_all,f_name+'_rotation_angle.png')) - - + plt.savefig(os.path.join(dir_of_all,image_filename_stem+'_rotation_angle.png')) try: var_res=np.array(var_res) - ang_int=angels[np.argmax(var_res)]#angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] except: ang_int=0 - early_slope_edge=11 if abs(ang_int)>early_slope_edge and ang_int<0: - angels=np.linspace(-90,-12,100) - var_res=[] - for rot in angels: img_rot=rotate_image(img_resized,rot) ##plt.imshow(img_rot) ##plt.show() img_rot[img_rot!=0]=1 - #res_me=np.mean(self.find_num_col_deskew(img_rot,sigma_des,2.0 )) try: var_spectrum=find_num_col_deskew(img_rot,sigma_des,20.3 ) except: var_spectrum=0 - var_res.append(var_spectrum) - - try: var_res=np.array(var_res) - ang_int=angels[np.argmax(var_res)]#angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] except: ang_int=0 @@ -1811,67 +1753,47 @@ def return_deskew_slop(img_patch_org, sigma_des, main_page=False, dir_of_all=Non elif abs(ang_int)>early_slope_edge and ang_int>0: angels=np.linspace(90,12,100) - - var_res=[] - for rot in angels: img_rot=rotate_image(img_resized,rot) ##plt.imshow(img_rot) ##plt.show() img_rot[img_rot!=0]=1 - #res_me=np.mean(self.find_num_col_deskew(img_rot,sigma_des,2.0 )) try: var_spectrum=find_num_col_deskew(img_rot,sigma_des,20.3 ) #print(indexer,'indexer') except: var_spectrum=0 - var_res.append(var_spectrum) - - try: var_res=np.array(var_res) - ang_int=angels[np.argmax(var_res)]#angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] except: ang_int=0 else: - - angels=np.linspace(-25,25,60) - var_res=[] - indexer=0 for rot in angels: img_rot=rotate_image(img_resized,rot) #plt.imshow(img_rot) #plt.show() img_rot[img_rot!=0]=1 - #res_me=np.mean(self.find_num_col_deskew(img_rot,sigma_des,2.0 )) - - #neg_peaks,var_spectrum=self.find_num_col_deskew(img_rot,sigma_des,20.3 ) #print(var_spectrum,'var_spectrum') try: 
var_spectrum=find_num_col_deskew(img_rot,sigma_des,20.3 ) except: var_spectrum=0 - var_res.append(var_spectrum) - - try: var_res=np.array(var_res) - ang_int=angels[np.argmax(var_res)]#angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] except: ang_int=0 #plt.plot(var_res) #plt.show() - ##plt.plot(mom3_res) ##plt.show() #print(ang_int,'ang_int111') @@ -1888,20 +1810,14 @@ def return_deskew_slop(img_patch_org, sigma_des, main_page=False, dir_of_all=Non ##plt.imshow(img_rot) ##plt.show() img_rot[img_rot!=0]=1 - #res_me=np.mean(self.find_num_col_deskew(img_rot,sigma_des,2.0 )) try: var_spectrum=find_num_col_deskew(img_rot,sigma_des,20.3 ) - except: var_spectrum=0 - var_res.append(var_spectrum) - - try: var_res=np.array(var_res) - ang_int=angels[np.argmax(var_res)]#angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] except: ang_int=0 @@ -1918,7 +1834,6 @@ def return_deskew_slop(img_patch_org, sigma_des, main_page=False, dir_of_all=Non ##plt.imshow(img_rot) ##plt.show() img_rot[img_rot!=0]=1 - #res_me=np.mean(self.find_num_col_deskew(img_rot,sigma_des,2.0 )) try: var_spectrum=find_num_col_deskew(img_rot,sigma_des,20.3 ) #print(indexer,'indexer') @@ -1926,12 +1841,8 @@ def return_deskew_slop(img_patch_org, sigma_des, main_page=False, dir_of_all=Non var_spectrum=0 var_res.append(var_spectrum) - - - try: var_res=np.array(var_res) - ang_int=angels[np.argmax(var_res)]#angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] except: ang_int=0 From 4a5c99008a5894671f40497f3c1015e538f00b4e Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 15:10:06 +0100 Subject: [PATCH 05/89] eynollah: define self.cont_page in __init__, extract page coord calculation --- sbb_newspapers_org_image/eynollah.py | 55 ++++++++++------------------ 1 file changed, 19 insertions(+), 36 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py 
index aaf1dbf..f3abfd3 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -139,6 +139,7 @@ class eynollah: headers_off=False ): self.image_filename = image_filename # XXX This does not seem to be a directory as the name suggests, but a file + self.cont_page = [] self.dir_out = dir_out self.image_filename_stem = image_filename_stem self.dir_of_cropped_images = dir_of_cropped_images @@ -821,7 +822,6 @@ class eynollah: croped_page, page_coord = crop_image_inside_box(box, self.image) - self.cont_page = [] self.cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) session_page.close() @@ -1414,24 +1414,7 @@ class eynollah: page_print_sub = ET.SubElement(page, "PrintSpace") coord_page = ET.SubElement(page_print_sub, "Coords") - points_page_print = "" - - for lmm in range(len(self.cont_page[0])): - if len(self.cont_page[0][lmm])==2: - points_page_print=points_page_print+str( int( (self.cont_page[0][lmm][0])/self.scale_x ) ) - points_page_print=points_page_print+',' - points_page_print=points_page_print+str( int( (self.cont_page[0][lmm][1])/self.scale_y ) ) - else: - points_page_print=points_page_print+str( int((self.cont_page[0][lmm][0][0])/self.scale_x) ) - points_page_print=points_page_print+',' - points_page_print=points_page_print+str( int((self.cont_page[0][lmm][0][1])/self.scale_y) ) - - if lmm<(len(self.cont_page[0])-1): - points_page_print=points_page_print+' ' - coord_page.set('points',points_page_print) - - - + coord_page.set('points', self.calculate_page_coords()) if len(contours)>0: region_order=ET.SubElement(page, 'ReadingOrder') @@ -1928,6 +1911,22 @@ class eynollah: tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") + def calculate_page_coords(self): + points_page_print = "" + for lmm in range(len(self.cont_page[0])): + if len(self.cont_page[0][lmm]) == 2: + points_page_print = 
points_page_print + str(int((self.cont_page[0][lmm][0] ) / self.scale_x)) + points_page_print = points_page_print + ',' + points_page_print = points_page_print + str(int((self.cont_page[0][lmm][1] ) / self.scale_y)) + else: + points_page_print = points_page_print + str(int((self.cont_page[0][lmm][0][0]) / self.scale_x)) + points_page_print = points_page_print + ',' + points_page_print = points_page_print + str(int((self.cont_page[0][lmm][0][1] ) / self.scale_y)) + + if lmm < (len( self.cont_page[0] ) - 1): + points_page_print = points_page_print + ' ' + return points_page_print + def write_into_page_xml(self, contours, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): found_polygons_text_region = contours @@ -1937,27 +1936,11 @@ class eynollah: pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) page_print_sub = ET.SubElement(page, "PrintSpace") coord_page = ET.SubElement(page_print_sub, "Coords") - points_page_print = "" - - for lmm in range(len(self.cont_page[0])): - if len(self.cont_page[0][lmm]) == 2: - points_page_print = points_page_print + str( int( ( self.cont_page[0][lmm][0] ) / self.scale_x ) ) - points_page_print = points_page_print + ',' - points_page_print = points_page_print + str( int( ( self.cont_page[0][lmm][1] ) / self.scale_y ) ) - else: - points_page_print = points_page_print + str( int( ( self.cont_page[0][lmm][0][0]) / self.scale_x ) ) - points_page_print=points_page_print + ',' - points_page_print=points_page_print + str( int( ( self.cont_page[0][lmm][0][1] ) / self.scale_y) ) - - if lmm < (len( self.cont_page[0] ) - 1): - points_page_print = points_page_print + ' ' - coord_page.set( 'points', points_page_print ) - + coord_page.set('points', self.calculate_page_coords()) if len(contours) > 0: region_order 
= ET.SubElement(page, 'ReadingOrder') region_order_sub = ET.SubElement(region_order, 'OrderedGroup') - region_order_sub.set('id',"ro357564684568544579089") indexer_region=0 From e11d9adfea65076e6f6f47b0f05bace9a550e03d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 17:23:43 +0100 Subject: [PATCH 06/89] outfactor serialize_lines_in_region --- sbb_newspapers_org_image/eynollah.py | 275 ++++++++------------------- 1 file changed, 84 insertions(+), 191 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index f3abfd3..e82078e 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1404,6 +1404,83 @@ class eynollah: poly.put(poly_sub) box_sub.put(boxes_sub_new) + def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes): + for j in range(len(all_found_texline_polygons[region_idx])): + textline=ET.SubElement(textregion, 'TextLine') + textline.set('id','l'+str(id_indexer_l)) + id_indexer_l+=1 + coord = ET.SubElement(textline, 'Coords') + texteq = ET.SubElement(textline, 'TextEquiv') + uni = ET.SubElement(texteq, 'Unicode') + uni.text = ' ' + + #points = ET.SubElement(coord, 'Points') + + points_co='' + for l in range(len(all_found_texline_polygons[region_idx][j])): + if not self.curved_line: + #point.set('x',str(found_polygons[j][l][0])) + #point.set('y',str(found_polygons[j][l][1])) + if len(all_found_texline_polygons[region_idx][j][l])==2: + textline_x_coord=int( (all_found_texline_polygons[region_idx][j][l][0] + +all_box_coord[region_idx][2]+page_coord[2])/self.scale_x) + textline_y_coord=int( (all_found_texline_polygons[region_idx][j][l][1] + +all_box_coord[region_idx][0]+page_coord[0])/self.scale_y) + + if textline_x_coord<0: + textline_x_coord=0 + if textline_y_coord<0: + textline_y_coord=0 + points_co=points_co+str( textline_x_coord ) + points_co=points_co+',' + points_co=points_co+str( 
textline_y_coord ) + else: + + textline_x_coord=int( ( all_found_texline_polygons[region_idx][j][l][0][0] + +all_box_coord[region_idx][2]+page_coord[2])/self.scale_x ) + textline_y_coord=int( ( all_found_texline_polygons[region_idx][j][l][0][1] + +all_box_coord[region_idx][0]+page_coord[0])/self.scale_y) + + if textline_x_coord<0: + textline_x_coord=0 + if textline_y_coord<0: + textline_y_coord=0 + + points_co=points_co+str( textline_x_coord ) + points_co=points_co+',' + points_co=points_co+str( textline_y_coord ) + + if (self.curved_line) and np.abs(slopes[region_idx]) <= 45 : + if len(all_found_texline_polygons[region_idx][j][l])==2: + points_co=points_co+str( int( (all_found_texline_polygons[region_idx][j][l][0] + +page_coord[2])/self.scale_x) ) + points_co=points_co+',' + points_co=points_co+str( int( (all_found_texline_polygons[region_idx][j][l][1] + +page_coord[0])/self.scale_y) ) + else: + points_co=points_co+str( int( ( all_found_texline_polygons[region_idx][j][l][0][0] + +page_coord[2])/self.scale_x ) ) + points_co=points_co+',' + points_co=points_co+str( int( ( all_found_texline_polygons[region_idx][j][l][0][1] + +page_coord[0])/self.scale_y) ) + elif (self.curved_line) and np.abs(slopes[region_idx]) > 45 : + if len(all_found_texline_polygons[region_idx][j][l])==2: + points_co=points_co+str( int( (all_found_texline_polygons[region_idx][j][l][0] + +all_box_coord[region_idx][2]+page_coord[2])/self.scale_x) ) + points_co=points_co+',' + points_co=points_co+str( int( (all_found_texline_polygons[region_idx][j][l][1] + +all_box_coord[region_idx][0]+page_coord[0])/self.scale_y) ) + else: + points_co=points_co+str( int( ( all_found_texline_polygons[region_idx][j][l][0][0] + +all_box_coord[region_idx][2]+page_coord[2])/self.scale_x ) ) + points_co=points_co+',' + points_co=points_co+str( int( ( all_found_texline_polygons[region_idx][j][l][0][1] + +all_box_coord[region_idx][0]+page_coord[0])/self.scale_y) ) + + if 
l<(len(all_found_texline_polygons[region_idx][j])-1): + points_co=points_co+' ' + coord.set('points',points_co) + def write_into_page_xml_full(self, contours, contours_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): found_polygons_text_region = contours @@ -1460,110 +1537,20 @@ class eynollah: points_co='' for lmm in range(len(found_polygons_text_region[mm])): if len(found_polygons_text_region[mm][lmm])==2: - points_co=points_co+str( int( (found_polygons_text_region[mm][lmm][0] +page_coord[2])/self.scale_x ) ) + points_co=points_co+str(int((found_polygons_text_region[mm][lmm][0] + page_coord[2]) / self.scale_x)) points_co=points_co+',' - points_co=points_co+str( int( (found_polygons_text_region[mm][lmm][1] +page_coord[0])/self.scale_y ) ) + points_co=points_co+str(int((found_polygons_text_region[mm][lmm][1] + page_coord[0]) / self.scale_y)) else: - points_co=points_co+str( int((found_polygons_text_region[mm][lmm][0][0] +page_coord[2])/self.scale_x) ) + points_co=points_co+str(int((found_polygons_text_region[mm][lmm][0][0] + page_coord[2]) / self.scale_x)) points_co=points_co+',' - points_co=points_co+str( int((found_polygons_text_region[mm][lmm][0][1] +page_coord[0])/self.scale_y) ) + points_co=points_co+str(int((found_polygons_text_region[mm][lmm][0][1] + page_coord[0]) / self.scale_y)) if lmm<(len(found_polygons_text_region[mm])-1): points_co=points_co+' ' #print(points_co) coord_text.set('points',points_co) - - - - - - for j in range(len(all_found_texline_polygons[mm])): - - textline=ET.SubElement(textregion, 'TextLine') - - textline.set('id','l'+str(id_indexer_l)) - - id_indexer_l+=1 - - - coord = ET.SubElement(textline, 'Coords') - - texteq=ET.SubElement(textline, 
'TextEquiv') - - uni=ET.SubElement(texteq, 'Unicode') - uni.text = ' ' - - #points = ET.SubElement(coord, 'Points') - - points_co='' - for l in range(len(all_found_texline_polygons[mm][j])): - #point = ET.SubElement(coord, 'Point') - - - if not self.curved_line: - #point.set('x',str(found_polygons[j][l][0])) - #point.set('y',str(found_polygons[j][l][1])) - if len(all_found_texline_polygons[mm][j][l])==2: - textline_x_coord=int( (all_found_texline_polygons[mm][j][l][0] - +all_box_coord[mm][2]+page_coord[2])/self.scale_x) - textline_y_coord=int( (all_found_texline_polygons[mm][j][l][1] - +all_box_coord[mm][0]+page_coord[0])/self.scale_y) - - if textline_x_coord<0: - textline_x_coord=0 - if textline_y_coord<0: - textline_y_coord=0 - points_co=points_co+str( textline_x_coord ) - points_co=points_co+',' - points_co=points_co+str( textline_y_coord ) - else: - - textline_x_coord=int( ( all_found_texline_polygons[mm][j][l][0][0] - +all_box_coord[mm][2]+page_coord[2])/self.scale_x ) - textline_y_coord=int( ( all_found_texline_polygons[mm][j][l][0][1] - +all_box_coord[mm][0]+page_coord[0])/self.scale_y) - - if textline_x_coord<0: - textline_x_coord=0 - if textline_y_coord<0: - textline_y_coord=0 - - points_co=points_co+str( textline_x_coord ) - points_co=points_co+',' - points_co=points_co+str( textline_y_coord ) - - if (self.curved_line) and np.abs(slopes[mm]) <= 45 : - if len(all_found_texline_polygons[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons[mm][j][l][0] - +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons[mm][j][l][1] - +page_coord[0])/self.scale_y) ) - else: - points_co=points_co+str( int( ( all_found_texline_polygons[mm][j][l][0][0] - +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons[mm][j][l][0][1] - +page_coord[0])/self.scale_y) ) - elif (self.curved_line) and np.abs(slopes[mm]) > 45 : - if 
len(all_found_texline_polygons[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons[mm][j][l][0] - +all_box_coord[mm][2]+page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons[mm][j][l][1] - +all_box_coord[mm][0]+page_coord[0])/self.scale_y) ) - else: - points_co=points_co+str( int( ( all_found_texline_polygons[mm][j][l][0][0] - +all_box_coord[mm][2]+page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons[mm][j][l][0][1] - +all_box_coord[mm][0]+page_coord[0])/self.scale_y) ) - - if l<(len(all_found_texline_polygons[mm][j])-1): - points_co=points_co+' ' - #print(points_co) - coord.set('points',points_co) - + + self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord) texteqreg=ET.SubElement(textregion, 'TextEquiv') unireg=ET.SubElement(texteqreg, 'Unicode') @@ -1590,102 +1577,8 @@ class eynollah: # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - points_co='' - for lmm in range(len(found_polygons_text_region_h[mm])): - - if len(found_polygons_text_region_h[mm][lmm])==2: - - points_co=points_co+str( int( (found_polygons_text_region_h[mm][lmm][0] +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( (found_polygons_text_region_h[mm][lmm][1] +page_coord[0])/self.scale_y ) ) - else: - points_co=points_co+str( int((found_polygons_text_region_h[mm][lmm][0][0] +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int((found_polygons_text_region_h[mm][lmm][0][1] +page_coord[0])/self.scale_y) ) - - if lmm<(len(found_polygons_text_region_h[mm])-1): - points_co=points_co+' ' - #print(points_co) - coord_text.set('points',points_co) - - - for j in range(len(all_found_texline_polygons_h[mm])): - - textline=ET.SubElement(textregion, 'TextLine') - - textline.set('id','l'+str(id_indexer_l)) - 
- id_indexer_l+=1 - - - coord = ET.SubElement(textline, 'Coords') - - texteq=ET.SubElement(textline, 'TextEquiv') - - uni=ET.SubElement(texteq, 'Unicode') - uni.text = ' ' - - #points = ET.SubElement(coord, 'Points') - - points_co='' - for l in range(len(all_found_texline_polygons_h[mm][j])): - #point = ET.SubElement(coord, 'Point') - - - if not self.curved_line: - #point.set('x',str(found_polygons[j][l][0])) - #point.set('y',str(found_polygons[j][l][1])) - if len(all_found_texline_polygons_h[mm][j][l])==2: - - textline_x_coord=int( (all_found_texline_polygons_h[mm][j][l][0] - +all_box_coord_h[mm][2]+page_coord[2])/self.scale_x) - textline_y_coord=int( (all_found_texline_polygons_h[mm][j][l][1] - +all_box_coord_h[mm][0]+page_coord[0])/self.scale_y) - - if textline_x_coord<0: - textline_x_coord=0 - if textline_y_coord<0: - textline_y_coord=0 - - points_co=points_co+str( textline_x_coord ) - points_co=points_co+',' - points_co=points_co+str( textline_y_coord ) - else: - - - textline_x_coord=int( ( all_found_texline_polygons_h[mm][j][l][0][0] - +all_box_coord_h[mm][2]+page_coord[2])/self.scale_x ) - textline_y_coord=int( ( all_found_texline_polygons_h[mm][j][l][0][1] - +all_box_coord_h[mm][0]+page_coord[0])/self.scale_y) - - if textline_x_coord<0: - textline_x_coord=0 - if textline_y_coord<0: - textline_y_coord=0 - - points_co=points_co+str( textline_x_coord ) - points_co=points_co+',' - points_co=points_co+str( textline_y_coord) - - if self.curved_line: - if len(all_found_texline_polygons_h[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons_h[mm][j][l][0] - +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons_h[mm][j][l][1] - +page_coord[0])/self.scale_y) ) - else: - points_co=points_co+str( int( ( all_found_texline_polygons_h[mm][j][l][0][0] - +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons_h[mm][j][l][0][1] 
- +page_coord[0])/self.scale_y) ) - - if l<(len(all_found_texline_polygons_h[mm][j])-1): - points_co=points_co+' ' - #print(points_co) - coord.set('points',points_co) + self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord, slopes) texteqreg=ET.SubElement(textregion, 'TextEquiv') unireg=ET.SubElement(texteqreg, 'Unicode') From 4d31a21c3e6b5b27ec630ba4e0c55893769d7ab0 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 18:39:16 +0100 Subject: [PATCH 07/89] keep id_indexer_l thru serialize_lines_in_region --- sbb_newspapers_org_image/eynollah.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index e82078e..293bbda 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1404,15 +1404,15 @@ class eynollah: poly.put(poly_sub) box_sub.put(boxes_sub_new) - def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes): + def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l): for j in range(len(all_found_texline_polygons[region_idx])): textline=ET.SubElement(textregion, 'TextLine') textline.set('id','l'+str(id_indexer_l)) - id_indexer_l+=1 + id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') texteq = ET.SubElement(textline, 'TextEquiv') uni = ET.SubElement(texteq, 'Unicode') - uni.text = ' ' + uni.text = ' ' #points = ET.SubElement(coord, 'Points') @@ -1480,6 +1480,7 @@ class eynollah: if l<(len(all_found_texline_polygons[region_idx][j])-1): points_co=points_co+' ' coord.set('points',points_co) + return id_indexer_l def write_into_page_xml_full(self, contours, contours_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, 
all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): @@ -1550,7 +1551,7 @@ class eynollah: #print(points_co) coord_text.set('points',points_co) - self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) texteqreg=ET.SubElement(textregion, 'TextEquiv') unireg=ET.SubElement(texteqreg, 'Unicode') @@ -1578,7 +1579,7 @@ class eynollah: coord_text = ET.SubElement(textregion, 'Coords') - self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord, slopes) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord, slopes, id_indexer_l) texteqreg=ET.SubElement(textregion, 'TextEquiv') unireg=ET.SubElement(texteqreg, 'Unicode') From dfb294eb2588c18669136caa79cfd504d8529b22 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 18:39:45 +0100 Subject: [PATCH 08/89] outfactor calculate_polygon_coords --- sbb_newspapers_org_image/eynollah.py | 161 +++++---------------------- 1 file changed, 29 insertions(+), 132 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 293bbda..9a04b69 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1482,6 +1482,23 @@ class eynollah: coord.set('points',points_co) return id_indexer_l + def calculate_polygon_coords(self, contour_list, i, j, page_coord): + coords = '' + for lmm in range(len(contour_list[i])): + if len(contour_list[i][j]) == 2: + coords += str(int((contour_list[i][j][0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((contour_list[i][j][1] + 
page_coord[0]) / self.scale_y)) + else: + coords += str(int((contour_list[i][j][0][0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((contour_list[i][j][0][1] + page_coord[0]) / self.scale_y)) + + if j < len(contour_list[mm]) - 1: + coords=coords+' ' + #print(coords) + return coords + def write_into_page_xml_full(self, contours, contours_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): found_polygons_text_region = contours @@ -1527,29 +1544,11 @@ class eynollah: textregion.set('id','r'+str(id_indexer)) id_indexer+=1 - + textregion.set('type','paragraph') - #if mm==0: - # textregion.set('type','header') - #else: - # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - - points_co='' - for lmm in range(len(found_polygons_text_region[mm])): - if len(found_polygons_text_region[mm][lmm])==2: - points_co=points_co+str(int((found_polygons_text_region[mm][lmm][0] + page_coord[2]) / self.scale_x)) - points_co=points_co+',' - points_co=points_co+str(int((found_polygons_text_region[mm][lmm][1] + page_coord[0]) / self.scale_y)) - else: - points_co=points_co+str(int((found_polygons_text_region[mm][lmm][0][0] + page_coord[2]) / self.scale_x)) - points_co=points_co+',' - points_co=points_co+str(int((found_polygons_text_region[mm][lmm][0][1] + page_coord[0]) / self.scale_y)) - - if lmm<(len(found_polygons_text_region[mm])-1): - points_co=points_co+' ' - #print(points_co) - coord_text.set('points',points_co) + + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, lmm, page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, 
slopes, id_indexer_l) texteqreg=ET.SubElement(textregion, 'TextEquiv') @@ -1607,23 +1606,7 @@ class eynollah: #else: # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - - points_co='' - for lmm in range(len(found_polygons_drop_capitals[mm])): - - if len(found_polygons_drop_capitals[mm][lmm])==2: - points_co=points_co+str( int( (found_polygons_drop_capitals[mm][lmm][0] +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( (found_polygons_drop_capitals[mm][lmm][1] +page_coord[0])/self.scale_y ) ) - else: - points_co=points_co+str( int((found_polygons_drop_capitals[mm][lmm][0][0] +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int((found_polygons_drop_capitals[mm][lmm][0][1] +page_coord[0])/self.scale_y) ) - - if lmm<(len(found_polygons_drop_capitals[mm])-1): - points_co=points_co+' ' - #print(points_co) - coord_text.set('points',points_co) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, lmm, page_coord) texteqreg=ET.SubElement(textregion, 'TextEquiv') @@ -1652,24 +1635,8 @@ class eynollah: #else: # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - - points_co='' - for lmm in range(len(found_polygons_marginals[mm])): - if len(found_polygons_marginals[mm][lmm])==2: - points_co=points_co+str( int( (found_polygons_marginals[mm][lmm][0]+page_coord[2] )/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( (found_polygons_marginals[mm][lmm][1]+page_coord[0] )/self.scale_y ) ) - else: - points_co=points_co+str( int((found_polygons_marginals[mm][lmm][0][0]+page_coord[2] )/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int((found_polygons_marginals[mm][lmm][0][1] +page_coord[0])/self.scale_y) ) - - if lmm<(len(found_polygons_marginals[mm])-1): - points_co=points_co+' ' - #print(points_co) - coord_text.set('points',points_co) - - + 
coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, lmm, page_coord)) + for j in range(len(all_found_texline_polygons_marginals[mm])): textline=ET.SubElement(textregion, 'TextLine') @@ -1743,27 +1710,8 @@ class eynollah: textregion.set('id','r'+str(id_indexer)) id_indexer+=1 - - coord_text = ET.SubElement(textregion, 'Coords') - - points_co='' - for lmm in range(len(found_polygons_text_region_img[mm])): - - if len(found_polygons_text_region_img[mm][lmm])==2: - points_co=points_co+str( int( (found_polygons_text_region_img[mm][lmm][0] +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( (found_polygons_text_region_img[mm][lmm][1] +page_coord[0])/self.scale_y ) ) - else: - points_co=points_co+str( int((found_polygons_text_region_img[mm][lmm][0][0] +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int((found_polygons_text_region_img[mm][lmm][0][1] +page_coord[0])/self.scale_y) ) - - if lmm<(len(found_polygons_text_region_img[mm])-1): - points_co=points_co+' ' - - - coord_text.set('points',points_co) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, lmm, page_coord)) except: pass @@ -1774,27 +1722,8 @@ class eynollah: textregion.set('id','r'+str(id_indexer)) id_indexer+=1 - - coord_text = ET.SubElement(textregion, 'Coords') - - points_co='' - for lmm in range(len(found_polygons_tables[mm])): - - if len(found_polygons_tables[mm][lmm])==2: - points_co=points_co+str( int( (found_polygons_tables[mm][lmm][0] +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( (found_polygons_tables[mm][lmm][1] +page_coord[0])/self.scale_y ) ) - else: - points_co=points_co+str( int((found_polygons_tables[mm][lmm][0][0] +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int((found_polygons_tables[mm][lmm][0][1] +page_coord[0])/self.scale_y) ) - - if
lmm<(len(found_polygons_tables[mm])-1): - points_co=points_co+' ' - - - coord_text.set('points',points_co) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, lmm, page_coord)) except: pass @@ -1877,22 +1806,7 @@ class eynollah: #else: # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - - points_co = '' - for lmm in range(len(found_polygons_text_region[mm])): - if len(found_polygons_text_region[mm][lmm]) == 2: - points_co = points_co + str( int( (found_polygons_text_region[mm][lmm][0] + page_coord[2] ) / self.scale_x ) ) - points_co = points_co + ',' - points_co = points_co + str( int( (found_polygons_text_region[mm][lmm][1] + page_coord[0] ) / self.scale_y ) ) - else: - points_co = points_co + str( int( (found_polygons_text_region[mm][lmm][0][0] + page_coord[2] ) / self.scale_x ) ) - points_co = points_co + ',' - points_co = points_co + str( int( (found_polygons_text_region[mm][lmm][0][1] + page_coord[0] ) /self.scale_y) ) - - if lmm < (len(found_polygons_text_region[mm]) - 1): - points_co = points_co + ' ' - #print(points_co) - coord_text.set('points', points_co) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, lmm, page_coord)) @@ -1981,15 +1895,13 @@ class eynollah: points_co = points_co + ',' points_co = points_co+str( int( ( all_found_texline_polygons[mm][j][l][0][1] + all_box_coord[mm][0] + page_coord[0]) / self.scale_y) ) - - + if l < (len(all_found_texline_polygons[mm][j]) - 1): points_co = points_co + ' ' #print(points_co) coord.set('points', points_co) - + texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg = ET.SubElement(texteqreg, 'Unicode') unireg.text = ' ' @@ -2013,22 +1925,7 @@ class eynollah: #else: # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - - points_co = '' - for lmm in range(len(found_polygons_marginals[mm])): - if len(found_polygons_marginals[mm][lmm])==2: - points_co=points_co+str(
int( (found_polygons_marginals[mm][lmm][0]+page_coord[2] )/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( (found_polygons_marginals[mm][lmm][1]+page_coord[0] )/self.scale_y ) ) - else: - points_co=points_co+str( int((found_polygons_marginals[mm][lmm][0][0]+page_coord[2] )/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int((found_polygons_marginals[mm][lmm][0][1] +page_coord[0])/self.scale_y) ) - - if lmm<(len(found_polygons_marginals[mm])-1): - points_co=points_co+' ' - #print(points_co) - coord_text.set('points',points_co) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, lmm, page_coord)) for j in range(len(all_found_texline_polygons_marginals[mm])): From ae1d335010ac97684de15d4229151c6e60f72567 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 19:11:28 +0100 Subject: [PATCH 09/89] :art: remove extraneous empty lines, simplify elif to else where possible --- sbb_newspapers_org_image/eynollah.py | 263 +++------------------------ 1 file changed, 28 insertions(+), 235 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 9a04b69..dc1d6cd 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -207,18 +207,17 @@ class eynollah: for i in range(nxf): for j in range(nyf): - if i == 0: index_x_d = i * width_mid index_x_u = index_x_d + img_width_model - elif i > 0: + else: index_x_d = i * width_mid index_x_u = index_x_d + img_width_model if j == 0: index_y_d = j * height_mid index_y_u = index_y_d + img_height_model - elif j > 0: + else: index_y_d = j * height_mid index_y_u = index_y_d + img_height_model @@ -230,7 +229,6 @@ class eynollah: index_y_d = img_h - img_height_model img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :] - label_p_pred = model_enhancement.predict(img_patch.reshape(1, img_patch.shape[0], img_patch.shape[1], img_patch.shape[2])) seg =
label_p_pred[0, :, :, :] @@ -239,43 +237,29 @@ class eynollah: if i == 0 and j == 0: seg = seg[0 : seg.shape[0] - margin, 0 : seg.shape[1] - margin] prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg - elif i == nxf - 1 and j == nyf - 1: seg = seg[margin : seg.shape[0] - 0, margin : seg.shape[1] - 0] prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - 0, :] = seg - elif i == 0 and j == nyf - 1: seg = seg[margin : seg.shape[0] - 0, 0 : seg.shape[1] - margin] - prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + 0 : index_x_u - margin, :] = seg - elif i == nxf - 1 and j == 0: seg = seg[0 : seg.shape[0] - margin, margin : seg.shape[1] - 0] - prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg - elif i == 0 and j != 0 and j != nyf - 1: seg = seg[margin : seg.shape[0] - margin, 0 : seg.shape[1] - margin] - prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg - elif i == nxf - 1 and j != 0 and j != nyf - 1: seg = seg[margin : seg.shape[0] - margin, margin : seg.shape[1] - 0] - prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg - elif i != 0 and i != nxf - 1 and j == 0: seg = seg[0 : seg.shape[0] - margin, margin : seg.shape[1] - margin] prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg - elif i != 0 and i != nxf - 1 and j == nyf - 1: seg = seg[margin : seg.shape[0] - 0, margin : seg.shape[1] - margin] - prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - margin, :] = seg - else: seg = seg[margin : seg.shape[0] - margin, margin : seg.shape[1] - margin] - prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg prediction_true = prediction_true.astype(int) @@ -297,9 +281,7 @@ class eynollah: 
model_num_classifier, session_col_classifier = self.start_new_session_and_model(self.model_dir_of_col_classifier) img_1ch = cv2.imread(self.image_filename, 0) - width_early = img_1ch.shape[1] - img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] # plt.imshow(img_1ch) @@ -329,66 +311,51 @@ class eynollah: if num_col == 1 and width_early < 1100: img_w_new = 2000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2000) - elif num_col == 1 and width_early >= 2500: img_w_new = 2000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2000) elif num_col == 1 and width_early >= 1100 and width_early < 2500: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 2 and width_early < 2000: img_w_new = 2400 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2400) - elif num_col == 2 and width_early >= 3500: img_w_new = 2400 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2400) - elif num_col == 2 and width_early >= 2000 and width_early < 3500: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 3 and width_early < 2000: img_w_new = 3000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 3000) - elif num_col == 3 and width_early >= 4000: img_w_new = 3000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 3000) - elif num_col == 3 and width_early >= 2000 and width_early < 4000: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 4 and width_early < 2500: img_w_new = 4000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 4000) - elif num_col == 4 and width_early >= 5000: img_w_new = 4000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 4000) - elif num_col == 4 and width_early >= 2500 and width_early < 5000: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 5 and width_early < 3700: img_w_new = 5000 img_h_new = 
int(img.shape[0] / float(img.shape[1]) * 5000) - elif num_col == 5 and width_early >= 7000: img_w_new = 5000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 5000) - elif num_col == 5 and width_early >= 3700 and width_early < 7000: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 6 and width_early < 4500: img_w_new = 6500 # 5400 img_h_new = int(img.shape[0] / float(img.shape[1]) * 6500) - else: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) @@ -459,66 +426,51 @@ class eynollah: if num_col == 1 and width_early < 1100: img_w_new = 2000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2000) - elif num_col == 1 and width_early >= 2500: img_w_new = 2000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2000) elif num_col == 1 and width_early >= 1100 and width_early < 2500: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 2 and width_early < 2000: img_w_new = 2400 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2400) - elif num_col == 2 and width_early >= 3500: img_w_new = 2400 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2400) - elif num_col == 2 and width_early >= 2000 and width_early < 3500: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 3 and width_early < 2000: img_w_new = 3000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 3000) - elif num_col == 3 and width_early >= 4000: img_w_new = 3000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 3000) - elif num_col == 3 and width_early >= 2000 and width_early < 4000: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 4 and width_early < 2500: img_w_new = 4000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 4000) - elif num_col == 4 and width_early >= 5000: img_w_new = 4000 img_h_new = int(img.shape[0] / 
float(img.shape[1]) * 4000) - elif num_col == 4 and width_early >= 2500 and width_early < 5000: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 5 and width_early < 3700: img_w_new = 5000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 5000) - elif num_col == 5 and width_early >= 7000: img_w_new = 5000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 5000) - elif num_col == 5 and width_early >= 3700 and width_early < 7000: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 6 and width_early < 4500: img_w_new = 6500 # 5400 img_h_new = int(img.shape[0] / float(img.shape[1]) * 6500) - else: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) @@ -626,14 +578,14 @@ class eynollah: if i == 0: index_x_d = i * width_mid index_x_u = index_x_d + img_width_model - elif i > 0: + else: index_x_d = i * width_mid index_x_u = index_x_d + img_width_model if j == 0: index_y_d = j * height_mid index_y_u = index_y_d + img_height_model - elif j > 0: + else: index_y_d = j * height_mid index_y_u = index_y_d + img_height_model @@ -652,63 +604,46 @@ class eynollah: if i == 0 and j == 0: seg_color = seg_color[0 : seg_color.shape[0] - margin, 0 : seg_color.shape[1] - margin, :] seg = seg[0 : seg.shape[0] - margin, 0 : seg.shape[1] - margin] - mask_true[index_y_d + 0 : index_y_u - margin, index_x_d + 0 : index_x_u - margin] = seg prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg_color - elif i == nxf - 1 and j == nyf - 1: seg_color = seg_color[margin : seg_color.shape[0] - 0, margin : seg_color.shape[1] - 0, :] seg = seg[margin : seg.shape[0] - 0, margin : seg.shape[1] - 0] - mask_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - 0] = seg prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - 0, :] = seg_color - elif i == 0 and j == 
nyf - 1: seg_color = seg_color[margin : seg_color.shape[0] - 0, 0 : seg_color.shape[1] - margin, :] seg = seg[margin : seg.shape[0] - 0, 0 : seg.shape[1] - margin] - mask_true[index_y_d + margin : index_y_u - 0, index_x_d + 0 : index_x_u - margin] = seg prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + 0 : index_x_u - margin, :] = seg_color - elif i == nxf - 1 and j == 0: seg_color = seg_color[0 : seg_color.shape[0] - margin, margin : seg_color.shape[1] - 0, :] seg = seg[0 : seg.shape[0] - margin, margin : seg.shape[1] - 0] - mask_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - 0] = seg prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg_color - elif i == 0 and j != 0 and j != nyf - 1: seg_color = seg_color[margin : seg_color.shape[0] - margin, 0 : seg_color.shape[1] - margin, :] seg = seg[margin : seg.shape[0] - margin, 0 : seg.shape[1] - margin] - mask_true[index_y_d + margin : index_y_u - margin, index_x_d + 0 : index_x_u - margin] = seg prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg_color - elif i == nxf - 1 and j != 0 and j != nyf - 1: seg_color = seg_color[margin : seg_color.shape[0] - margin, margin : seg_color.shape[1] - 0, :] seg = seg[margin : seg.shape[0] - margin, margin : seg.shape[1] - 0] - mask_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - 0] = seg prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg_color - elif i != 0 and i != nxf - 1 and j == 0: seg_color = seg_color[0 : seg_color.shape[0] - margin, margin : seg_color.shape[1] - margin, :] seg = seg[0 : seg.shape[0] - margin, margin : seg.shape[1] - margin] - mask_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - margin] = seg prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg_color - elif i != 0 and i != 
nxf - 1 and j == nyf - 1: seg_color = seg_color[margin : seg_color.shape[0] - 0, margin : seg_color.shape[1] - margin, :] seg = seg[margin : seg.shape[0] - 0, margin : seg.shape[1] - margin] - mask_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - margin] = seg prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - margin, :] = seg_color - else: seg_color = seg_color[margin : seg_color.shape[0] - margin, margin : seg_color.shape[1] - margin, :] seg = seg[margin : seg.shape[0] - margin, margin : seg.shape[1] - margin] - mask_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin] = seg prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg_color @@ -753,20 +688,13 @@ class eynollah: imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) _, thresh = cv2.threshold(imgray, 0, 255, 0) - thresh = cv2.dilate(thresh, self.kernel, iterations=3) contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) - cnt = contours[np.argmax(cnt_size)] - x, y, w, h = cv2.boundingRect(cnt) - box = [x, y, w, h] - croped_page, page_coord = crop_image_inside_box(box, img) - session_page.close() del model_page del session_page @@ -801,9 +729,7 @@ class eynollah: contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) - cnt = contours[np.argmax(cnt_size)] - x, y, w, h = cv2.boundingRect(cnt) if x <= 30: @@ -811,7 +737,6 @@ class eynollah: x = 0 if (self.image.shape[1] - (x + w)) <= 30: w = w + (self.image.shape[1] - (x + w)) - if y <= 30: h = h + y y = 0 @@ -819,7 +744,6 @@ class eynollah: h = h + (self.image.shape[0] - (y + h)) box = [x, y, w, h] - croped_page, page_coord = crop_image_inside_box(box, self.image) 
self.cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) @@ -1811,63 +1735,45 @@ class eynollah: for j in range(len(all_found_texline_polygons[mm])): - textline=ET.SubElement(textregion, 'TextLine') - textline.set('id', 'l' + str(id_indexer_l)) - id_indexer_l += 1 - - coord = ET.SubElement(textline, 'Coords') - texteq=ET.SubElement(textline, 'TextEquiv') - uni=ET.SubElement(texteq, 'Unicode') uni.text = ' ' - #points = ET.SubElement(coord, 'Points') - points_co='' for l in range(len(all_found_texline_polygons[mm][j])): #point = ET.SubElement(coord, 'Point') - - if not curved_line: #point.set('x',str(found_polygons[j][l][0])) #point.set('y',str(found_polygons[j][l][1])) if len(all_found_texline_polygons[mm][j][l]) == 2: - textline_x_coord = int( (all_found_texline_polygons[mm][j][l][0] + all_box_coord[mm][2] + page_coord[2]) / self.scale_x) textline_y_coord=int( (all_found_texline_polygons[mm][j][l][1] + all_box_coord[mm][0] + page_coord[0]) / self.scale_y) - if textline_x_coord < 0: textline_x_coord = 0 if textline_y_coord < 0: textline_y_coord = 0 - points_co = points_co + str( textline_x_coord ) points_co = points_co + ',' points_co = points_co + str( textline_y_coord ) else: - textline_x_coord = int( ( all_found_texline_polygons[mm][j][l][0][0] + all_box_coord[mm][2]+page_coord[2])/self.scale_x ) - textline_y_coord=int( ( all_found_texline_polygons[mm][j][l][0][1] +all_box_coord[mm][0]+page_coord[0])/self.scale_y) - if textline_x_coord < 0: textline_x_coord = 0 if textline_y_coord < 0: textline_y_coord = 0 - points_co = points_co + str( textline_x_coord ) points_co = points_co + ',' points_co = points_co + str( textline_y_coord ) - + if (self.curved_line) and abs(slopes[mm]) <= 45: if len(all_found_texline_polygons[mm][j][l]) == 2: points_co=points_co + str( int( (all_found_texline_polygons[mm][j][l][0] @@ -1904,11 +1810,8 @@ class eynollah: texteqreg = 
ET.SubElement(textregion, 'TextEquiv') unireg = ET.SubElement(texteqreg, 'Unicode') unireg.text = ' ' - - try: #id_indexer_l=0 - try: id_indexer_l = id_indexer_l except: @@ -1916,40 +1819,21 @@ class eynollah: for mm in range(len(found_polygons_marginals)): textregion = ET.SubElement(page, 'TextRegion') - textregion.set('id', id_of_marginalia[mm]) - textregion.set('type', 'marginalia') - #if mm==0: - # textregion.set('type','header') - #else: - # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, lmm, page_coord)) - for j in range(len(all_found_texline_polygons_marginals[mm])): - textline=ET.SubElement(textregion, 'TextLine') - textline.set('id','l'+str(id_indexer_l)) - id_indexer_l+=1 - - coord = ET.SubElement(textline, 'Coords') - - texteq=ET.SubElement(textline, 'TextEquiv') - - uni=ET.SubElement(texteq, 'Unicode') + texteq = ET.SubElement(textline, 'TextEquiv') + uni = ET.SubElement(texteq, 'Unicode') uni.text = ' ' - #points = ET.SubElement(coord, 'Points') - points_co='' for l in range(len(all_found_texline_polygons_marginals[mm][j])): - #point = ET.SubElement(coord, 'Point') - - if not curved_line: #point.set('x',str(found_polygons[j][l][0])) #point.set('y',str(found_polygons[j][l][1])) @@ -1965,8 +1849,7 @@ class eynollah: points_co=points_co+',' points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1] +all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) ) - - if curved_line: + else: if len(all_found_texline_polygons_marginals[mm][j][l])==2: points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] +page_coord[2])/self.scale_x) ) @@ -1979,7 +1862,6 @@ class eynollah: points_co=points_co+',' points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1] +page_coord[0])/self.scale_y) ) - if l<(len(all_found_texline_polygons_marginals[mm][j])-1):
points_co=points_co+' ' #print(points_co) @@ -2005,8 +1887,6 @@ class eynollah: if lmm<(len(found_polygons_text_region_img[mm])-1): points_co=points_co+' ' - - coord_text.set('points',points_co) except: pass @@ -2019,75 +1899,58 @@ class eynollah: # cv2.imwrite(os.path.join(dir_of_image, self.image_filename_stem) + ".tif",self.image_org) def get_regions_from_xy_2models(self,img,is_image_enhanced): - img_org=np.copy(img) - - img_height_h=img_org.shape[0] - img_width_h=img_org.shape[1] + img_org = np.copy(img) + img_height_h = img_org.shape[0] + img_width_h = img_org.shape[1] model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_ens) gaussian_filter=False patches=True binary=False - - - - - ratio_y=1.3 ratio_x=1 - median_blur=False - img= resize_image(img_org, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x)) + img = resize_image(img_org, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x)) if binary: - img = otsu_copy_binary(img)#self.otsu_copy(img) + img = otsu_copy_binary(img) img = img.astype(np.uint16) - if median_blur: - img=cv2.medianBlur(img,5) + img = cv2.medianBlur(img,5) if gaussian_filter: img= cv2.GaussianBlur(img,(5,5),0) img = img.astype(np.uint16) - prediction_regions_org_y=self.do_prediction(patches,img,model_region) + prediction_regions_org_y = self.do_prediction(patches,img,model_region) prediction_regions_org_y = resize_image(prediction_regions_org_y, img_height_h, img_width_h ) #plt.imshow(prediction_regions_org_y[:,:,0]) #plt.show() #sys.exit() prediction_regions_org_y=prediction_regions_org_y[:,:,0] - - mask_zeros_y=(prediction_regions_org_y[:,:]==0)*1 - - - - - if is_image_enhanced: - ratio_x=1.2 + ratio_x = 1.2 else: - ratio_x=1 - - ratio_y=1 + ratio_x = 1 + ratio_y = 1 median_blur=False - img= resize_image(img_org, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x)) + img = resize_image(img_org, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x)) if binary: img = 
otsu_copy_binary(img)#self.otsu_copy(img) img = img.astype(np.uint16) - if median_blur: - img=cv2.medianBlur(img,5) + img = cv2.medianBlur(img, 5) if gaussian_filter: - img= cv2.GaussianBlur(img,(5,5),0) + img = cv2.GaussianBlur(img, (5,5 ), 0) img = img.astype(np.uint16) - prediction_regions_org=self.do_prediction(patches,img,model_region) - prediction_regions_org=resize_image(prediction_regions_org, img_height_h, img_width_h ) + prediction_regions_org = self.do_prediction(patches,img,model_region) + prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) ##plt.imshow(prediction_regions_org[:,:,0]) ##plt.show() @@ -2105,10 +1968,6 @@ class eynollah: gaussian_filter=False patches=True binary=False - - - - ratio_x=1 ratio_y=1 median_blur=False @@ -2626,17 +2485,13 @@ class eynollah: img_g = img_g.astype(np.uint8) img_g3 = np.zeros((img_g.shape[0], img_g.shape[1], 3)) - img_g3 = img_g3.astype(np.uint8) - img_g3[:, :, 0] = img_g[:, :] img_g3[:, :, 1] = img_g[:, :] img_g3[:, :, 2] = img_g[:, :] image_page, page_coord = self.extract_page() - # print(image_page.shape,'page') - if self.dir_of_all is not None: cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_page.png"), image_page) K.clear_session() @@ -2649,13 +2504,12 @@ class eynollah: text_regions_p_1 = text_regions_p_1[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] mask_images = (text_regions_p_1[:, :] == 2) * 1 - mask_lines = (text_regions_p_1[:, :] == 3) * 1 - mask_images = mask_images.astype(np.uint8) - mask_lines = mask_lines.astype(np.uint8) - mask_images = cv2.erode(mask_images[:, :], self.kernel, iterations=10) + mask_lines = (text_regions_p_1[:, :] == 3) * 1 + mask_lines = mask_lines.astype(np.uint8) + img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) img_only_regions = cv2.erode(img_only_regions_with_sep[:, :], self.kernel, 
iterations=6) @@ -2692,11 +2546,8 @@ class eynollah: K.clear_session() gc.collect() - #print(np.unique(textline_mask_tot_ea[:, :]), "textline") - if self.dir_of_all is not None: - values = np.unique(textline_mask_tot_ea[:, :]) pixels = ["Background", "Textlines"] values_indexes = [0, 1] @@ -2738,19 +2589,11 @@ class eynollah: min_area = 0.00001 max_area = 0.0006 textline_mask_tot_small_size = return_contours_of_interested_region_by_size(textline_mask_tot, pixel_img, min_area, max_area) - - # text_regions_p_1[(textline_mask_tot[:,:]==1) & (text_regions_p_1[:,:]==2)]=1 - text_regions_p_1[mask_lines[:, :] == 1] = 3 - - ##text_regions_p_1[textline_mask_tot_small_size[:,:]==1]=1 - text_regions_p = text_regions_p_1[:, :] # long_short_region[:,:]#self.get_regions_from_2_models(image_page) - text_regions_p = np.array(text_regions_p) if num_col_classifier == 1 or num_col_classifier == 2: - try: regions_without_seperators = (text_regions_p[:, :] == 1) * 1 regions_without_seperators = regions_without_seperators.astype(np.uint8) @@ -2759,8 +2602,6 @@ class eynollah: except: pass - else: - pass # plt.imshow(text_regions_p) # plt.show() @@ -2776,12 +2617,9 @@ class eynollah: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n = rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 - regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) pixel_lines = 3 @@ -2794,31 +2632,24 @@ class eynollah: gc.collect() # print(peaks_neg_fin,num_col,'num_col2') - print(num_col_classifier, 
"num_col_classifier") if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: regions_without_seperators = regions_without_seperators.astype(np.uint8) regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) - #random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 #random_pixels_for_image[random_pixels_for_image != 0] = 1 - #regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 2)] = 1 - - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + else: regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) - #random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 #random_pixels_for_image[random_pixels_for_image != 0] = 1 #regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 2)] = 1 - else: - pass if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) @@ -2826,13 +2657,9 @@ class eynollah: boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) # print(len(boxes),'boxes') - # sys.exit() - print("boxes in: " + str(time.time() - t1)) img_revised_tab = text_regions_p[:, :] - - pixel_img = 2 polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) @@ -2852,14 +2679,11 @@ class eynollah: K.clear_session() # gc.collect() - patches = True - image_page = image_page.astype(np.uint8) # print(type(image_page)) regions_fully, 
regions_fully_only_drop = self.extract_text_regions(image_page, patches, cols=num_col_classifier) - text_regions_p[:,:][regions_fully[:,:,0]==6]=6 regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) @@ -2903,7 +2727,6 @@ class eynollah: # plt.show() text_regions_p[:, :][regions_fully[:, :, 0] == 4] = 4 - text_regions_p[:, :][regions_fully_np[:, :, 0] == 4] = 4 # plt.imshow(text_regions_p) @@ -2915,18 +2738,14 @@ class eynollah: text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) K.clear_session() gc.collect() - img_revised_tab = np.copy(text_regions_p[:, :]) - print("full layout in: " + str(time.time() - t1)) - pixel_img = 5 polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) @@ -2950,16 +2769,11 @@ class eynollah: # plt.show() min_con_area = 0.000005 - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text, hir_on_text = return_contours_of_image(text_only) contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) - areas_cnt_text = np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) - areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) - contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] contours_only_text_parent = [contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz] > 
min_con_area] areas_cnt_text_parent = [areas_cnt_text[jz] for jz in range(len(areas_cnt_text)) if areas_cnt_text[jz] > min_con_area] @@ -2975,26 +2789,20 @@ class eynollah: contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) areas_cnt_text_d = np.array([cv2.contourArea(contours_only_text_parent_d[j]) for j in range(len(contours_only_text_parent_d))]) - areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] - index_con_parents_d=np.argsort(areas_cnt_text_d) contours_only_text_parent_d=list(np.array(contours_only_text_parent_d)[index_con_parents_d] ) areas_cnt_text_d=list(np.array(areas_cnt_text_d)[index_con_parents_d] ) cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest_d]) cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent_d) - try: cx_bigest_d_last5=cx_bigest_d[-5:] cy_biggest_d_last5=cy_biggest_d[-5:] - dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) for j in range(len(cy_biggest_d_last5))] - ind_largest=len(cx_bigest_d)-5+np.argmin(dists_d) - cx_bigest_d_big[0]=cx_bigest_d[ind_largest] cy_biggest_d_big[0]=cy_biggest_d[ind_largest] except: @@ -3032,18 +2840,15 @@ class eynollah: dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + (p[1] - cy_biggest_d[j]) ** 2) for j in range(len(cx_bigest_d))] # print(np.argmin(dists)) contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) - # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) # plt.imshow(img2[:,:,0]) # plt.show() - else: contours_only_text, hir_on_text = return_contours_of_image(text_only) contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) 
areas_cnt_text = np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) - areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] @@ -3061,42 +2866,32 @@ class eynollah: # print(len(contours_only_text_parent),len(contours_only_text_parent_d),'vizzz') txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) - boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) if not self.curved_line: slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) - slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) - if self.curved_line: + else: scale_param = 1 all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) - all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) - all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal, slopes_marginals = 
self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) - all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) - index_of_vertical_text_contours = np.array(range(len(slopes)))[(abs(np.array(slopes)) > 60)] - contours_text_vertical = [contours_only_text_parent[i] for i in index_of_vertical_text_contours] K.clear_session() gc.collect() - # print(index_by_text_par_con,'index_by_text_par_con') if self.full_layout: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) else: contours_only_text_parent_d_ordered = None - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) if self.dir_of_layout is not None: @@ -3110,10 +2905,8 @@ class eynollah: ##print('Job done in: '+str(time.time()-t1)) polygons_of_tabels = [] - pixel_img = 4 
polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) - all_found_texline_polygons = adhere_drop_capital_region_into_cprresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, kernel=self.kernel, curved_line=self.curved_line) # print(len(contours_only_text_parent_h),len(contours_only_text_parent_h_d_ordered),'contours_only_text_parent_h') From 7905b3b9d274a65889085a3a39da4437785899b6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 29 Jan 2021 17:34:18 +0100 Subject: [PATCH 10/89] outfactor calculate_width_height_by_columns --- sbb_newspapers_org_image/eynollah.py | 136 +++++++++------------------ 1 file changed, 42 insertions(+), 94 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index f6bc1b2..607e44e 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -273,41 +273,7 @@ class eynollah: dpi = os.popen('identify -format "%x " ' + self.image_filename).read() return int(float(dpi)) - def resize_image_with_column_classifier(self, is_image_enhanced): - img = cv2.imread(self.image_filename) - img = img.astype(np.uint8) - - _, page_coord = self.early_page_for_num_of_column_classification() - model_num_classifier, session_col_classifier = self.start_new_session_and_model(self.model_dir_of_col_classifier) - - img_1ch = cv2.imread(self.image_filename, 0) - width_early = img_1ch.shape[1] - img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] - - # plt.imshow(img_1ch) - # plt.show() - img_1ch = img_1ch / 255.0 - - img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) - - img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) - img_in[0, :, :, 0] = img_1ch[:, :] - img_in[0, :, :, 1] = img_1ch[:, :] - img_in[0, :, :, 
2] = img_1ch[:, :] - - label_p_pred = model_num_classifier.predict(img_in) - num_col = np.argmax(label_p_pred[0]) + 1 - - print(num_col, label_p_pred, "num_col_classifier") - - session_col_classifier.close() - del model_num_classifier - del session_col_classifier - - K.clear_session() - gc.collect() - - # sys.exit() + def calculate_width_height_by_columns(self, img, num_col, width_early): if num_col == 1 and width_early < 1100: img_w_new = 2000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2000) @@ -367,6 +333,45 @@ class eynollah: img_new = resize_image(img, img_h_new, img_w_new) num_column_is_classified = True + return img_new, num_column_is_classified + + def resize_image_with_column_classifier(self, is_image_enhanced): + img = cv2.imread(self.image_filename) + img = img.astype(np.uint8) + + _, page_coord = self.early_page_for_num_of_column_classification() + model_num_classifier, session_col_classifier = self.start_new_session_and_model(self.model_dir_of_col_classifier) + + img_1ch = cv2.imread(self.image_filename, 0) + width_early = img_1ch.shape[1] + img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + # plt.imshow(img_1ch) + # plt.show() + img_1ch = img_1ch / 255.0 + + img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) + + img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) + img_in[0, :, :, 0] = img_1ch[:, :] + img_in[0, :, :, 1] = img_1ch[:, :] + img_in[0, :, :, 2] = img_1ch[:, :] + + label_p_pred = model_num_classifier.predict(img_in) + num_col = np.argmax(label_p_pred[0]) + 1 + + print(num_col, label_p_pred, "num_col_classifier") + + session_col_classifier.close() + del model_num_classifier + del session_col_classifier + + K.clear_session() + gc.collect() + + # sys.exit() + img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early) + if img_new.shape[1] > img.shape[1]: img_new = self.predict_enhancement(img_new) is_image_enhanced = True @@ 
-423,64 +428,7 @@ class eynollah: if dpi < 298: # sys.exit() - if num_col == 1 and width_early < 1100: - img_w_new = 2000 - img_h_new = int(img.shape[0] / float(img.shape[1]) * 2000) - elif num_col == 1 and width_early >= 2500: - img_w_new = 2000 - img_h_new = int(img.shape[0] / float(img.shape[1]) * 2000) - elif num_col == 1 and width_early >= 1100 and width_early < 2500: - img_w_new = width_early - img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 2 and width_early < 2000: - img_w_new = 2400 - img_h_new = int(img.shape[0] / float(img.shape[1]) * 2400) - elif num_col == 2 and width_early >= 3500: - img_w_new = 2400 - img_h_new = int(img.shape[0] / float(img.shape[1]) * 2400) - elif num_col == 2 and width_early >= 2000 and width_early < 3500: - img_w_new = width_early - img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 3 and width_early < 2000: - img_w_new = 3000 - img_h_new = int(img.shape[0] / float(img.shape[1]) * 3000) - elif num_col == 3 and width_early >= 4000: - img_w_new = 3000 - img_h_new = int(img.shape[0] / float(img.shape[1]) * 3000) - elif num_col == 3 and width_early >= 2000 and width_early < 4000: - img_w_new = width_early - img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 4 and width_early < 2500: - img_w_new = 4000 - img_h_new = int(img.shape[0] / float(img.shape[1]) * 4000) - elif num_col == 4 and width_early >= 5000: - img_w_new = 4000 - img_h_new = int(img.shape[0] / float(img.shape[1]) * 4000) - elif num_col == 4 and width_early >= 2500 and width_early < 5000: - img_w_new = width_early - img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 5 and width_early < 3700: - img_w_new = 5000 - img_h_new = int(img.shape[0] / float(img.shape[1]) * 5000) - elif num_col == 5 and width_early >= 7000: - img_w_new = 5000 - img_h_new = int(img.shape[0] / float(img.shape[1]) * 5000) - elif num_col == 5 and width_early >= 3700 and 
width_early < 7000: - img_w_new = width_early - img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - elif num_col == 6 and width_early < 4500: - img_w_new = 6500 # 5400 - img_h_new = int(img.shape[0] / float(img.shape[1]) * 6500) - else: - img_w_new = width_early - img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) - - if label_p_pred[0][int(num_col - 1)] < 0.9 and img_w_new < width_early: - img_new = np.copy(img) - num_column_is_classified = False - else: - img_new = resize_image(img, img_h_new, img_w_new) - num_column_is_classified = True + img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early) # img_new=resize_image(img,img_h_new,img_w_new) image_res = self.predict_enhancement(img_new) @@ -1533,7 +1481,7 @@ class eynollah: coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, lmm, page_coord) - texteqreg=ET.SubElement(textregion, 'TextEquiv') + texteqreg = ET.SubElement(textregion, 'TextEquiv') unireg=ET.SubElement(texteqreg, 'Unicode') unireg.text = ' ' From b93c1923d0480f59de38d6be0fa6bf13e41b8f3c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 29 Jan 2021 17:44:54 +0100 Subject: [PATCH 11/89] clean up empty lines, fix syntax error --- sbb_newspapers_org_image/eynollah.py | 110 +++------------------------ 1 file changed, 11 insertions(+), 99 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 607e44e..487e0b9 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -744,9 +744,7 @@ class eynollah: if patches and cols == 2: img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint8) - if img_width_h >= 2000: img = resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9)) else: @@ -755,7 +753,6 @@ class eynollah: if patches and cols == 1: img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint8) img = 
resize_image(img, int(img_height_h * 0.5), int(img_width_h * 0.5)) img = img.astype(np.uint8) @@ -766,13 +763,11 @@ class eynollah: img = img.astype(np.uint8) #img= self.resize_image(img, int(img_height_h*0.8), int(img_width_h*0.8) ) img= resize_image(img, int(img_height_h*2800/float(img_width_h)), 2800 ) - else: img = otsu_copy_binary(img)#self.otsu_copy(img) img = img.astype(np.uint8) #img= self.resize_image(img, int(img_height_h*0.9), int(img_width_h*0.9) ) - - + if patches and cols==4: #print(self.scale_x,img_width_h,'scale') if (self.scale_x==1 and img_width_h>4000) or (self.scale_x!=1 and img_width_h>3700): @@ -784,7 +779,7 @@ class eynollah: img = otsu_copy_binary(img)#self.otsu_copy(img) img = img.astype(np.uint8) img= resize_image(img, int(img_height_h*0.9), int(img_width_h*0.9) ) - + if patches and cols==5: if (self.scale_x==1 and img_width_h>5000): img = otsu_copy_binary(img)#self.otsu_copy(img) @@ -795,7 +790,7 @@ class eynollah: img = otsu_copy_binary(img)#self.otsu_copy(img) img = img.astype(np.uint8) img= resize_image(img, int(img_height_h*0.9), int(img_width_h*0.9) ) - + if patches and cols>=6: if img_width_h>5600: img = otsu_copy_binary(img)#self.otsu_copy(img) @@ -943,28 +938,21 @@ class eynollah: all_text_region_raw = textline_mask_tot_ea[boxes_text[mv][1] : boxes_text[mv][1] + boxes_text[mv][3], boxes_text[mv][0] : boxes_text[mv][0] + boxes_text[mv][2]] all_text_region_raw = all_text_region_raw.astype(np.uint8) - img_int_p = all_text_region_raw[:, :] # self.all_text_region_raw[mv] ##img_int_p=cv2.erode(img_int_p,self.kernel,iterations = 2) - # plt.imshow(img_int_p) # plt.show() if img_int_p.shape[0] / img_int_p.shape[1] < 0.1: - slopes_per_each_subprocess.append(0) - slope_first = 0 slope_for_all = [slope_deskew][0] - else: - try: textline_con, hierachy = return_contours_of_image(img_int_p) textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con, hierachy, max_area=1, min_area=0.0008) y_diff_mean = 
find_contours_mean_y_diff(textline_con_fil) - sigma_des = int(y_diff_mean * (4.0 / 40.0)) if sigma_des < 1: @@ -979,37 +967,26 @@ class eynollah: # old method # slope_for_all=self.textline_contours_to_get_slope_correctly(self.all_text_region_raw[mv],denoised,contours[mv]) # text_patch_processed=textline_contours_postprocessing(gada) - except: slope_for_all = 999 - ##slope_for_all=return_deskew_slop(img_int_p,sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) - if slope_for_all == 999: slope_for_all = [slope_deskew][0] - ##if np.abs(slope_for_all)>32.5 and slope_for_all!=999: - ##slope_for_all=slope_biggest - ##elif slope_for_all==999: - ##slope_for_all=slope_biggest slopes_per_each_subprocess.append(slope_for_all) index_by_text_region_contours.append(indexes_r_con_per_pro[mv]) - crop_img, crop_coor = crop_image_inside_box(boxes_text[mv], image_page_rotated) + if abs(slope_for_all) < 45: # all_box_coord.append(crop_coor) textline_region_in_image = np.zeros(textline_mask_tot_ea.shape) cnt_o_t_max = contours_par_per_process[mv] - x, y, w, h = cv2.boundingRect(cnt_o_t_max) - mask_biggest = np.zeros(mask_texts_only.shape) mask_biggest = cv2.fillPoly(mask_biggest, pts=[cnt_o_t_max], color=(1, 1, 1)) - mask_region_in_patch_region = mask_biggest[y : y + h, x : x + w] - textline_biggest_region = mask_biggest * textline_mask_tot_ea # print(slope_for_all,'slope_for_all') @@ -1025,7 +1002,6 @@ class eynollah: # plt.imshow(textline_region_in_image) # plt.show() - # plt.imshow(textline_cnt_seperated) # plt.show() @@ -1039,21 +1015,16 @@ class eynollah: if num_col + 1 == 1: mask_biggest2 = cv2.dilate(mask_biggest2, self.kernel, iterations=5) else: - mask_biggest2 = cv2.dilate(mask_biggest2, self.kernel, iterations=4) pixel_img = 1 - mask_biggest2 = resize_image(mask_biggest2, int(mask_biggest2.shape[0] * scale_par), int(mask_biggest2.shape[1] * scale_par)) - cnt_textlines_in_image_ind = return_contours_of_interested_textline(mask_biggest2, 
pixel_img) - try: # textlines_cnt_per_region.append(cnt_textlines_in_image_ind[0]/scale_par) textlines_cnt_per_region.append(cnt_textlines_in_image_ind[0]) except: pass - else: slope_first = 0 add_boxes_coor_into_textlines = True @@ -1061,13 +1032,8 @@ class eynollah: add_boxes_coor_into_textlines = False # print(np.shape(textlines_cnt_per_region),'textlines_cnt_per_region') - # textlines_cnt_tot_per_process.append(textlines_cnt_per_region) - # index_polygons_per_process_per_process.append(index_polygons_per_process[iiii]) - textlines_rectangles_per_each_subprocess.append(textlines_cnt_per_region) - # all_found_texline_polygons.append(cnt_clean_rot) bounding_box_of_textregion_per_each_subprocess.append(boxes_text[mv]) - contours_textregion_per_each_subprocess.append(contours_per_process[mv]) contours_textregion_par_per_each_subprocess.append(contours_par_per_process[mv]) all_box_coord_per_process.append(crop_coor) @@ -1086,74 +1052,42 @@ class eynollah: slope_biggest = 0 for mv in range(len(boxes_text)): - crop_img,crop_coor=crop_image_inside_box(boxes_text[mv],image_page_rotated) - - #all_box_coord.append(crop_coor) - mask_textline=np.zeros((textline_mask_tot_ea.shape)) - mask_textline=cv2.fillPoly(mask_textline,pts=[contours_per_process[mv]],color=(1,1,1)) - - - denoised=None all_text_region_raw=(textline_mask_tot_ea*mask_textline[:,:])[boxes_text[mv][1]:boxes_text[mv][1]+boxes_text[mv][3] , boxes_text[mv][0]:boxes_text[mv][0]+boxes_text[mv][2] ] all_text_region_raw=all_text_region_raw.astype(np.uint8) - img_int_p=all_text_region_raw[:,:]#self.all_text_region_raw[mv] - img_int_p=cv2.erode(img_int_p,self.kernel,iterations = 2) - + if img_int_p.shape[0]/img_int_p.shape[1]<0.1: - slopes_per_each_subprocess.append(0) - slope_for_all = [slope_deskew][0] - all_text_region_raw = textline_mask_tot_ea[boxes_text[mv][1] : boxes_text[mv][1] + boxes_text[mv][3], boxes_text[mv][0] : boxes_text[mv][0] + boxes_text[mv][2]] cnt_clean_rot = 
textline_contours_postprocessing(all_text_region_raw, slope_for_all, contours_par_per_process[mv], boxes_text[mv], 0) - textlines_rectangles_per_each_subprocess.append(cnt_clean_rot) - index_by_text_region_contours.append(indexes_r_con_per_pro[mv]) - # all_found_texline_polygons.append(cnt_clean_rot) bounding_box_of_textregion_per_each_subprocess.append(boxes_text[mv]) else: - try: textline_con, hierachy = return_contours_of_image(img_int_p) textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con, hierachy, max_area=1, min_area=0.00008) - y_diff_mean = find_contours_mean_y_diff(textline_con_fil) - sigma_des = int(y_diff_mean * (4.0 / 40.0)) - if sigma_des < 1: sigma_des = 1 - img_int_p[img_int_p > 0] = 1 - # slope_for_all=self.return_deskew_slope_new(img_int_p,sigma_des) slope_for_all = return_deskew_slop(img_int_p, sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) - if abs(slope_for_all) <= 0.5: slope_for_all = [slope_deskew][0] - except: slope_for_all = 999 - ##slope_for_all=return_deskew_slop(img_int_p,sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) - if slope_for_all == 999: slope_for_all = [slope_deskew][0] - ##if np.abs(slope_for_all)>32.5 and slope_for_all!=999: - ##slope_for_all=slope_biggest - ##elif slope_for_all==999: - ##slope_for_all=slope_biggest slopes_per_each_subprocess.append(slope_for_all) - slope_first = 0 - mask_only_con_region = np.zeros(textline_mask_tot_ea.shape) mask_only_con_region = cv2.fillPoly(mask_only_con_region, pts=[contours_par_per_process[mv]], color=(1, 1, 1)) @@ -1166,7 +1100,6 @@ class eynollah: ##plt.show() ##plt.imshow(all_text_region_raw) ##plt.show() - ##plt.imshow(mask_only_con_region) ##plt.show() @@ -1175,7 +1108,6 @@ class eynollah: textlines_rectangles_per_each_subprocess.append(cnt_clean_rot) index_by_text_region_contours.append(indexes_r_con_per_pro[mv]) - # all_found_texline_polygons.append(cnt_clean_rot) 
bounding_box_of_textregion_per_each_subprocess.append(boxes_text[mv]) contours_textregion_per_each_subprocess.append(contours_per_process[mv]) @@ -1190,38 +1122,22 @@ class eynollah: model_textline, session_textline = self.start_new_session_and_model(self.model_textline_dir) if not patches: model_textline, session_textline = self.start_new_session_and_model(self.model_textline_dir_np) - - ##img = otsu_copy(img) img = img.astype(np.uint8) - img_org = np.copy(img) img_h = img_org.shape[0] img_w = img_org.shape[1] - img = resize_image(img_org, int(img_org.shape[0] * scaler_h), int(img_org.shape[1] * scaler_w)) - prediction_textline = self.do_prediction(patches, img, model_textline) - prediction_textline = resize_image(prediction_textline, img_h, img_w) - patches = False prediction_textline_longshot = self.do_prediction(patches, img, model_textline) - prediction_textline_longshot_true_size = resize_image(prediction_textline_longshot, img_h, img_w) - # scaler_w=1.5 - # scaler_h=1.5 - # patches=True - # img= resize_image(img_org, int(img_org.shape[0]*scaler_h), int(img_org.shape[1]*scaler_w)) - # prediction_textline_streched=self.do_prediction(patches,img,model_textline) - # prediction_textline_streched= resize_image(prediction_textline_streched, img_h, img_w) - ##plt.imshow(prediction_textline_streched[:,:,0]) ##plt.show() - # sys.exit() session_textline.close() del model_textline @@ -1261,10 +1177,6 @@ class eynollah: if slope_corresponding_textregion == 999: slope_corresponding_textregion = slope_biggest - ##if np.abs(slope_corresponding_textregion)>12.5 and slope_corresponding_textregion!=999: - ##slope_corresponding_textregion=slope_biggest - ##elif slope_corresponding_textregion==999: - ##slope_corresponding_textregion=slope_biggest slopes_sub.append(slope_corresponding_textregion) cnt_clean_rot = textline_contours_postprocessing(crop_img, slope_corresponding_textregion, contours_per_process[mv], boxes_per_process[mv]) @@ -1478,11 +1390,11 @@ class eynollah: #else: 
# textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, lmm, page_coord) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, lmm, page_coord)) - texteqreg = ET.SubElement(textregion, 'TextEquiv') + texteqreg = ET.SubElement(textregion, 'TextEquiv') unireg=ET.SubElement(texteqreg, 'Unicode') unireg.text = ' ' @@ -1507,7 +1419,7 @@ class eynollah: #else: # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, lmm, page_coord) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, lmm, page_coord)) for j in range(len(all_found_texline_polygons_marginals[mm])): @@ -1583,7 +1495,7 @@ class eynollah: textregion.set('id','r'+str(id_indexer)) id_indexer+=1 coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, lmm, page_coord) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, lmm, page_coord)) except: pass @@ -1595,7 +1507,7 @@ class eynollah: textregion.set('id','r'+str(id_indexer)) id_indexer+=1 coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, lmm, page_coord) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, lmm, page_coord)) except: pass @@ -1770,7 +1682,7 @@ class eynollah: textregion.set('id', id_of_marginalia[mm]) textregion.set('type', 'marginalia') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, lmm, page_coord) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, lmm, page_coord)) for j in 
range(len(all_found_texline_polygons_marginals[mm])): textline=ET.SubElement(textregion, 'TextLine') textline.set('id','l'+str(id_indexer_l)) From 5a46e7ed279a78a60aa09ada6f92db898e560188 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 1 Feb 2021 11:27:33 +0100 Subject: [PATCH 12/89] fix signature of calculate_polygon_coords --- sbb_newspapers_org_image/eynollah.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 487e0b9..03cb699 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1266,9 +1266,9 @@ class eynollah: coord.set('points',points_co) return id_indexer_l - def calculate_polygon_coords(self, contour_list, i, j, page_coord): + def calculate_polygon_coords(self, contour_list, i, page_coord): coords = '' - for lmm in range(len(contour_list[i])): + for j in range(len(contour_list[i])): if len(contour_list[i][j]) == 2: coords += str(int((contour_list[i][j][0] + page_coord[2]) / self.scale_x)) coords += ',' @@ -1278,7 +1278,7 @@ class eynollah: coords += ',' coords += str(int((contour_list[i][j][0][1] + page_coord[0]) / self.scale_y)) - if j < len(contour_list[mm]) - 1: + if j < len(contour_list[i]) - 1: coords=coords+' ' #print(coords) return coords @@ -1332,7 +1332,7 @@ class eynollah: textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, lmm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) texteqreg=ET.SubElement(textregion, 'TextEquiv') @@ -1390,7 +1390,7 @@ class eynollah: #else: # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 
'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, lmm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) @@ -1419,7 +1419,7 @@ class eynollah: #else: # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, lmm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) for j in range(len(all_found_texline_polygons_marginals[mm])): @@ -1495,7 +1495,7 @@ class eynollah: textregion.set('id','r'+str(id_indexer)) id_indexer+=1 coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, lmm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) except: pass @@ -1507,7 +1507,7 @@ class eynollah: textregion.set('id','r'+str(id_indexer)) id_indexer+=1 coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, lmm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) except: pass @@ -1590,7 +1590,7 @@ class eynollah: #else: # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, lmm, page_coord)) + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) @@ -1682,7 +1682,7 @@ class eynollah: textregion.set('id', id_of_marginalia[mm]) textregion.set('type', 'marginalia') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, lmm, page_coord)) + coord_text.set('points', 
self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) for j in range(len(all_found_texline_polygons_marginals[mm])): textline=ET.SubElement(textregion, 'TextLine') textline.set('id','l'+str(id_indexer_l)) From 28d35f8e6bf61b1e8708a3535f5e86796f999310 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 1 Feb 2021 12:07:50 +0100 Subject: [PATCH 13/89] start simplifying copy/paste coordinate logic --- sbb_newspapers_org_image/eynollah.py | 49 ++++------------------------ 1 file changed, 7 insertions(+), 42 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 5981aa4..c53db5c 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1422,31 +1422,16 @@ class eynollah: coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) for j in range(len(all_found_texline_polygons_marginals[mm])): - textline=ET.SubElement(textregion, 'TextLine') - textline.set('id','l'+str(id_indexer_l)) - id_indexer_l+=1 - - coord = ET.SubElement(textline, 'Coords') - texteq=ET.SubElement(textline, 'TextEquiv') - uni=ET.SubElement(texteq, 'Unicode') uni.text = ' ' - - #points = ET.SubElement(coord, 'Points') - points_co='' for l in range(len(all_found_texline_polygons_marginals[mm][j])): - #point = ET.SubElement(coord, 'Point') - - if not self.curved_line: - #point.set('x',str(found_polygons[j][l][0])) - #point.set('y',str(found_polygons[j][l][1])) if len(all_found_texline_polygons_marginals[mm][j][l])==2: points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] +all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x) ) @@ -1459,8 +1444,7 @@ class eynollah: points_co=points_co+',' points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1] +all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) ) - - if self.curved_line : + else: if 
len(all_found_texline_polygons_marginals[mm][j][l])==2: points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] +page_coord[2])/self.scale_x) ) @@ -1610,29 +1594,13 @@ class eynollah: #point.set('x',str(found_polygons[j][l][0])) #point.set('y',str(found_polygons[j][l][1])) if len(all_found_texline_polygons[mm][j][l]) == 2: - textline_x_coord = int( (all_found_texline_polygons[mm][j][l][0] - + all_box_coord[mm][2] + page_coord[2]) / self.scale_x) - textline_y_coord=int( (all_found_texline_polygons[mm][j][l][1] - + all_box_coord[mm][0] + page_coord[0]) / self.scale_y) - if textline_x_coord < 0: - textline_x_coord = 0 - if textline_y_coord < 0: - textline_y_coord = 0 - points_co = points_co + str( textline_x_coord ) - points_co = points_co + ',' - points_co = points_co + str( textline_y_coord ) + textline_x_coord = max(0, int((all_found_texline_polygons[mm][j][l][0] + all_box_coord[mm][2] + page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((all_found_texline_polygons[mm][j][l][1] + all_box_coord[mm][0] + page_coord[0]) / self.scale_y)) + points_co += str(textline_x_coord) + ',' + str(textline_y_coord) else: - textline_x_coord = int( ( all_found_texline_polygons[mm][j][l][0][0] - + all_box_coord[mm][2]+page_coord[2])/self.scale_x ) - textline_y_coord=int( ( all_found_texline_polygons[mm][j][l][0][1] - +all_box_coord[mm][0]+page_coord[0])/self.scale_y) - if textline_x_coord < 0: - textline_x_coord = 0 - if textline_y_coord < 0: - textline_y_coord = 0 - points_co = points_co + str( textline_x_coord ) - points_co = points_co + ',' - points_co = points_co + str( textline_y_coord ) + textline_x_coord = max(0, int((all_found_texline_polygons[mm][j][l][0][0] + all_box_coord[mm][2]+page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((all_found_texline_polygons[mm][j][l][0][1] + all_box_coord[mm][0]+page_coord[0]) / self.scale_y)) + points_co += str(textline_x_coord) + ',' + str(textline_y_coord) if (self.curved_line) and 
abs(slopes[mm]) <= 45: if len(all_found_texline_polygons[mm][j][l]) == 2: @@ -1691,12 +1659,9 @@ class eynollah: texteq = ET.SubElement(textline, 'TextEquiv') uni = ET.SubElement(texteq, 'Unicode') uni.text = ' ' - #points = ET.SubElement(coord, 'Points') points_co='' for l in range(len(all_found_texline_polygons_marginals[mm][j])): if not curved_line: - #point.set('x',str(found_polygons[j][l][0])) - #point.set('y',str(found_polygons[j][l][1])) if len(all_found_texline_polygons_marginals[mm][j][l])==2: points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] +all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x) ) From 0bd9a10525c5e63b8dde97ae7fe5304bfee69f53 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 1 Feb 2021 12:54:10 +0100 Subject: [PATCH 14/89] continue simplifying copy/paste coordinate logic --- sbb_newspapers_org_image/eynollah.py | 115 ++++++++++----------------- 1 file changed, 41 insertions(+), 74 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index c53db5c..15ecce2 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1506,13 +1506,13 @@ class eynollah: points_page_print = "" for lmm in range(len(self.cont_page[0])): if len(self.cont_page[0][lmm]) == 2: - points_page_print = points_page_print + str(int((self.cont_page[0][lmm][0] ) / self.scale_x)) - points_page_print = points_page_print + ',' - points_page_print = points_page_print + str(int((self.cont_page[0][lmm][1] ) / self.scale_y)) + points_page_print += str(int((self.cont_page[0][lmm][0] ) / self.scale_x)) + points_page_print += ',' + points_page_print += str(int((self.cont_page[0][lmm][1] ) / self.scale_y)) else: - points_page_print = points_page_print + str(int((self.cont_page[0][lmm][0][0]) / self.scale_x)) - points_page_print = points_page_print + ',' - points_page_print = points_page_print + str(int((self.cont_page[0][lmm][0][1] ) / self.scale_y)) + 
points_page_print += str(int((self.cont_page[0][lmm][0][0]) / self.scale_x)) + points_page_print += ',' + points_page_print += str(int((self.cont_page[0][lmm][0][1] ) / self.scale_y)) if lmm < (len( self.cont_page[0] ) - 1): points_page_print = points_page_print + ' ' @@ -1564,20 +1564,11 @@ class eynollah: for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r'+str(id_indexer)) id_indexer += 1 - textregion.set('type', 'paragraph') - #if mm==0: - # textregion.set('type','header') - #else: - # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) - - - for j in range(len(all_found_texline_polygons[mm])): textline=ET.SubElement(textregion, 'TextLine') textline.set('id', 'l' + str(id_indexer_l)) @@ -1586,13 +1577,10 @@ class eynollah: texteq=ET.SubElement(textline, 'TextEquiv') uni=ET.SubElement(texteq, 'Unicode') uni.text = ' ' - #points = ET.SubElement(coord, 'Points') points_co='' for l in range(len(all_found_texline_polygons[mm][j])): #point = ET.SubElement(coord, 'Point') if not curved_line: - #point.set('x',str(found_polygons[j][l][0])) - #point.set('y',str(found_polygons[j][l][1])) if len(all_found_texline_polygons[mm][j][l]) == 2: textline_x_coord = max(0, int((all_found_texline_polygons[mm][j][l][0] + all_box_coord[mm][2] + page_coord[2]) / self.scale_x)) textline_y_coord = max(0, int((all_found_texline_polygons[mm][j][l][1] + all_box_coord[mm][0] + page_coord[0]) / self.scale_y)) @@ -1601,38 +1589,27 @@ class eynollah: textline_x_coord = max(0, int((all_found_texline_polygons[mm][j][l][0][0] + all_box_coord[mm][2]+page_coord[2]) / self.scale_x)) textline_y_coord = max(0, int((all_found_texline_polygons[mm][j][l][0][1] + all_box_coord[mm][0]+page_coord[0]) / self.scale_y)) points_co += str(textline_x_coord) + ',' + str(textline_y_coord) - - if (self.curved_line) and 
abs(slopes[mm]) <= 45: + if curved_line and abs(slopes[mm]) <= 45: if len(all_found_texline_polygons[mm][j][l]) == 2: - points_co=points_co + str( int( (all_found_texline_polygons[mm][j][l][0] - + page_coord[2]) / self.scale_x) ) - points_co = points_co + ',' - points_co = points_co + str( int( (all_found_texline_polygons[mm][j][l][1] - + page_coord[0]) / self.scale_y) ) + points_co += str(int((all_found_texline_polygons[mm][j][l][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[mm][j][l][1] + page_coord[0]) / self.scale_y)) else: - points_co = points_co + str( int( ( all_found_texline_polygons[mm][j][l][0][0] - + page_coord[2]) / self.scale_x ) ) + points_co = points_co + str(int((all_found_texline_polygons[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) points_co = points_co + ',' - points_co = points_co + str( int( ( all_found_texline_polygons[mm][j][l][0][1] - + page_coord[0]) / self.scale_y) ) - - elif (self.curved_line) and abs(slopes[mm]) > 45: + points_co = points_co + str(int((all_found_texline_polygons[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) + elif curved_line and abs(slopes[mm]) > 45: if len(all_found_texline_polygons[mm][j][l]) == 2: - points_co = points_co + str( int( (all_found_texline_polygons[mm][j][l][0] - + all_box_coord[mm][2] + page_coord[2]) / self.scale_x) ) - points_co = points_co + ',' - points_co = points_co + str( int( (all_found_texline_polygons[mm][j][l][1] - + all_box_coord[mm][0] + page_coord[0]) / self.scale_y) ) + points_co += str(int((all_found_texline_polygons[mm][j][l][0] + all_box_coord[mm][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[mm][j][l][1] + all_box_coord[mm][0] + page_coord[0]) / self.scale_y)) else: - points_co = points_co + str( int( ( all_found_texline_polygons[mm][j][l][0][0] - + all_box_coord[mm][2] + page_coord[2]) / self.scale_x ) ) - points_co = points_co + ',' - points_co = 
points_co+str( int( ( all_found_texline_polygons[mm][j][l][0][1] - + all_box_coord[mm][0] + page_coord[0]) / self.scale_y) ) + points_co += str(int((all_found_texline_polygons[mm][j][l][0][0] + all_box_coord[mm][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[mm][j][l][0][1] + all_box_coord[mm][0] + page_coord[0]) / self.scale_y)) - if l < (len(all_found_texline_polygons[mm][j]) - 1): - points_co = points_co + ' ' - #print(points_co) + if l < len(all_found_texline_polygons[mm][j]) - 1: + points_co += ' ' coord.set('points', points_co) texteqreg = ET.SubElement(textregion, 'TextEquiv') @@ -1662,34 +1639,25 @@ class eynollah: points_co='' for l in range(len(all_found_texline_polygons_marginals[mm][j])): if not curved_line: - if len(all_found_texline_polygons_marginals[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] - +all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1] - +all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) ) + if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) else: - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0] - +all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1] - +all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) ) + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + all_box_coord_marginals[mm][2] + 
page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + all_box_coord_marginals[mm][0] + page_coord[0])/self.scale_y)) else: if len(all_found_texline_polygons_marginals[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] - +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1] - +page_coord[0])/self.scale_y) ) + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + page_coord[0]) / self.scale_y)) else: - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0] - +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1] - +page_coord[0])/self.scale_y) ) - if l<(len(all_found_texline_polygons_marginals[mm][j])-1): - points_co=points_co+' ' - #print(points_co) + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) + if l < len(all_found_texline_polygons_marginals[mm][j]) - 1: + points_co += ' ' coord.set('points',points_co) except: pass @@ -1706,13 +1674,12 @@ class eynollah: coord_text = ET.SubElement(textregion, 'Coords') points_co='' for lmm in range(len(found_polygons_text_region_img[mm])): - points_co=points_co+str( int( (found_polygons_text_region_img[mm][lmm,0,0]+page_coord[2] )/self.scale_x ) ) + points_co=points_co+str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) points_co=points_co+',' - points_co=points_co+str( int( (found_polygons_text_region_img[mm][lmm,0,1]+page_coord[0] 
)/self.scale_y ) ) - - if lmm<(len(found_polygons_text_region_img[mm])-1): - points_co=points_co+' ' - coord_text.set('points',points_co) + points_co=points_co+str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) + if lmm < len(found_polygons_text_region_img[mm]) - 1: + points_co += ' ' + coord_text.set('points', points_co) except: pass From 8f0455590ad75742e5cc109d1ac887819439862c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 1 Feb 2021 12:55:54 +0100 Subject: [PATCH 15/89] fix signature of calculate_width_height_by_columns --- sbb_newspapers_org_image/eynollah.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 15ecce2..6b71b32 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -273,7 +273,7 @@ class eynollah: dpi = os.popen('identify -format "%x " ' + self.image_filename).read() return int(float(dpi)) - def calculate_width_height_by_columns(self, img, num_col, width_early): + def calculate_width_height_by_columns(self, img, num_col, width_early, label_p_pred): if num_col == 1 and width_early < 1100: img_w_new = 2000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2000) @@ -370,7 +370,7 @@ class eynollah: gc.collect() # sys.exit() - img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early) + img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) if img_new.shape[1] > img.shape[1]: img_new = self.predict_enhancement(img_new) @@ -428,7 +428,7 @@ class eynollah: if dpi < 298: # sys.exit() - img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early) + img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) # img_new=resize_image(img,img_h_new,img_w_new) image_res 
= self.predict_enhancement(img_new) From df3e16132526d63cb4c238c7ebc4876656196b59 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 1 Feb 2021 14:03:02 +0100 Subject: [PATCH 16/89] :art: cleanup, remove unused vars, use max over if-else --- sbb_newspapers_org_image/eynollah.py | 106 +++++++++------------------ 1 file changed, 33 insertions(+), 73 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 6b71b32..4daa61d 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1,4 +1,4 @@ -# pylint: disable=no-member +# pylint: disable=no-member,invalid-name,line-too-long """ tool to extract table form data from alto xml data """ @@ -419,18 +419,12 @@ class eynollah: del img_in del img_1ch del page_coord - K.clear_session() gc.collect() - print(dpi) if dpi < 298: - - # sys.exit() img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) - - # img_new=resize_image(img,img_h_new,img_w_new) image_res = self.predict_enhancement(img_new) # cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif",self.image) # self.image=self.image.astype(np.uint16) @@ -480,7 +474,6 @@ class eynollah: return model, session - def do_prediction(self, patches, img, model, marginal_of_patch_percent=0.1): img_height_model = model.layers[len(model.layers) - 1].output_shape[1] @@ -495,48 +488,34 @@ class eynollah: img = resize_image(img, img.shape[0], img_width_model) # print(img_height_model,img_width_model) - # margin = int(0.2 * img_width_model) margin = int(marginal_of_patch_percent * img_height_model) - width_mid = img_width_model - 2 * margin height_mid = img_height_model - 2 * margin - img = img / float(255.0) - # print(sys.getsizeof(img)) - # print(np.max(img)) - img = img.astype(np.float16) - - # print(sys.getsizeof(img)) - img_h = img.shape[0] img_w = img.shape[1] - prediction_true = np.zeros((img_h, img_w, 3)) 
mask_true = np.zeros((img_h, img_w)) nxf = img_w / float(width_mid) nyf = img_h / float(height_mid) - nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) for i in range(nxf): for j in range(nyf): - if i == 0: index_x_d = i * width_mid index_x_u = index_x_d + img_width_model else: index_x_d = i * width_mid index_x_u = index_x_d + img_width_model - if j == 0: index_y_d = j * height_mid index_y_u = index_y_d + img_height_model else: index_y_d = j * height_mid index_y_u = index_y_d + img_height_model - if index_x_u > img_w: index_x_u = img_w index_x_d = img_w - img_width_model @@ -681,7 +660,7 @@ class eynollah: x, y, w, h = cv2.boundingRect(cnt) if x <= 30: - w = w + x + w += x x = 0 if (self.image.shape[1] - (x + w)) <= 30: w = w + (self.image.shape[1] - (x + w)) @@ -717,33 +696,31 @@ class eynollah: model_region, session_region = self.start_new_session_and_model(self.model_region_dir_fully_np) if patches and cols == 1: - img2 = otsu_copy_binary(img) # otsu_copy(img) + img2 = otsu_copy_binary(img) img2 = img2.astype(np.uint8) img2 = resize_image(img2, int(img_height_h * 0.7), int(img_width_h * 0.7)) - marginal_of_patch_percent = 0.1 prediction_regions2 = self.do_prediction(patches, img2, model_region, marginal_of_patch_percent) prediction_regions2 = resize_image(prediction_regions2, img_height_h, img_width_h) if patches and cols == 2: - img2 = otsu_copy_binary(img) # otsu_copy(img) + img2 = otsu_copy_binary(img) img2 = img2.astype(np.uint8) img2 = resize_image(img2, int(img_height_h * 0.4), int(img_width_h * 0.4)) - marginal_of_patch_percent = 0.1 prediction_regions2 = self.do_prediction(patches, img2, model_region, marginal_of_patch_percent) prediction_regions2 = resize_image(prediction_regions2, img_height_h, img_width_h) + elif patches and cols > 2: - img2 = otsu_copy_binary(img) # otsu_copy(img) + img2 = otsu_copy_binary(img) img2 = img2.astype(np.uint8) img2 = resize_image(img2, int(img_height_h * 0.3), 
int(img_width_h * 0.3)) - marginal_of_patch_percent = 0.1 prediction_regions2 = self.do_prediction(patches, img2, model_region, marginal_of_patch_percent) prediction_regions2 = resize_image(prediction_regions2, img_height_h, img_width_h) if patches and cols == 2: - img = otsu_copy_binary(img) # otsu_copy(img) + img = otsu_copy_binary(img) img = img.astype(np.uint8) if img_width_h >= 2000: img = resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9)) @@ -752,60 +729,55 @@ class eynollah: img = img.astype(np.uint8) if patches and cols == 1: - img = otsu_copy_binary(img) # otsu_copy(img) + img = otsu_copy_binary(img) img = img.astype(np.uint8) img = resize_image(img, int(img_height_h * 0.5), int(img_width_h * 0.5)) img = img.astype(np.uint8) - if patches and cols==3: - if (self.scale_x==1 and img_width_h>3000) or (self.scale_x!=1 and img_width_h>2800): - img = otsu_copy_binary(img)#self.otsu_copy(img) + if patches and cols == 3: + if (self.scale_x == 1 and img_width_h > 3000) or (self.scale_x != 1 and img_width_h > 2800): + img = otsu_copy_binary(img) img = img.astype(np.uint8) - #img= self.resize_image(img, int(img_height_h*0.8), int(img_width_h*0.8) ) - img= resize_image(img, int(img_height_h*2800/float(img_width_h)), 2800 ) + img = resize_image(img, int(img_height_h * 2800 / float(img_width_h)), 2800) else: - img = otsu_copy_binary(img)#self.otsu_copy(img) + img = otsu_copy_binary(img) img = img.astype(np.uint8) - #img= self.resize_image(img, int(img_height_h*0.9), int(img_width_h*0.9) ) - if patches and cols==4: + if patches and cols == 4: #print(self.scale_x,img_width_h,'scale') - if (self.scale_x==1 and img_width_h>4000) or (self.scale_x!=1 and img_width_h>3700): - img = otsu_copy_binary(img)#self.otsu_copy(img) + if (self.scale_x == 1 and img_width_h > 4000) or (self.scale_x != 1 and img_width_h > 3700): + img = otsu_copy_binary(img) img = img.astype(np.uint8) - #img= self.resize_image(img, int(img_height_h*0.7), int(img_width_h*0.7) ) - img= 
resize_image(img, int(img_height_h*3700/float(img_width_h)), 3700 ) + img= resize_image(img, int(img_height_h * 3700 / float(img_width_h)), 3700) else: img = otsu_copy_binary(img)#self.otsu_copy(img) img = img.astype(np.uint8) - img= resize_image(img, int(img_height_h*0.9), int(img_width_h*0.9) ) + img= resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9)) if patches and cols==5: - if (self.scale_x==1 and img_width_h>5000): - img = otsu_copy_binary(img)#self.otsu_copy(img) + if self.scale_x == 1 and img_width_h > 5000: + img = otsu_copy_binary(img) img = img.astype(np.uint8) - img= resize_image(img, int(img_height_h*0.7), int(img_width_h*0.7) ) - #img= self.resize_image(img, int(img_height_h*4700/float(img_width_h)), 4700 ) + img= resize_image(img, int(img_height_h * 0.7), int(img_width_h * 0.7)) else: - img = otsu_copy_binary(img)#self.otsu_copy(img) + img = otsu_copy_binary(img) img = img.astype(np.uint8) - img= resize_image(img, int(img_height_h*0.9), int(img_width_h*0.9) ) + img= resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9) ) if patches and cols>=6: - if img_width_h>5600: - img = otsu_copy_binary(img)#self.otsu_copy(img) + if img_width_h > 5600: + img = otsu_copy_binary(img) img = img.astype(np.uint8) - #img= self.resize_image(img, int(img_height_h*0.7), int(img_width_h*0.7) ) - img= resize_image(img, int(img_height_h*5600/float(img_width_h)), 5600 ) + img= resize_image(img, int(img_height_h * 5600 / float(img_width_h)), 5600) else: - img = otsu_copy_binary(img)#self.otsu_copy(img) + img = otsu_copy_binary(img) img = img.astype(np.uint8) - img= resize_image(img, int(img_height_h*0.9), int(img_width_h*0.9) ) + img= resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9)) if not patches: - img = otsu_copy_binary(img)#self.otsu_copy(img) + img = otsu_copy_binary(img) img = img.astype(np.uint8) - prediction_regions2=None + prediction_regions2 = None marginal_of_patch_percent = 0.1 prediction_regions = 
self.do_prediction(patches, img, model_region, marginal_of_patch_percent) @@ -847,7 +819,6 @@ class eynollah: for i in range(num_cores): list_all_par = queue_of_all_params.get(True) - slopes_for_sub_process = list_all_par[0] polys_for_sub_process = list_all_par[1] boxes_for_sub_process = list_all_par[2] @@ -855,7 +826,6 @@ class eynollah: contours_par_for_subprocess = list_all_par[4] boxes_coord_for_subprocess = list_all_par[5] indexes_for_subprocess = list_all_par[6] - for j in range(len(slopes_for_sub_process)): slopes.append(slopes_for_sub_process[j]) all_found_texline_polygons.append(polys_for_sub_process[j]) @@ -899,7 +869,6 @@ class eynollah: for i in range(num_cores): list_all_par = queue_of_all_params.get(True) - polys_for_sub_process = list_all_par[0] boxes_for_sub_process = list_all_par[1] contours_for_subprocess = list_all_par[2] @@ -907,7 +876,6 @@ class eynollah: boxes_coord_for_subprocess = list_all_par[4] indexes_for_subprocess = list_all_par[5] slopes_for_sub_process = list_all_par[6] - for j in range(len(polys_for_sub_process)): slopes.append(slopes_for_sub_process[j]) all_found_texline_polygons.append(polys_for_sub_process[j]) @@ -930,7 +898,6 @@ class eynollah: contours_textregion_par_per_each_subprocess = [] all_box_coord_per_process = [] index_by_text_region_contours = [] - slope_biggest = 0 textline_cnt_seperated = np.zeros(textline_mask_tot_ea.shape) @@ -938,9 +905,9 @@ class eynollah: all_text_region_raw = textline_mask_tot_ea[boxes_text[mv][1] : boxes_text[mv][1] + boxes_text[mv][3], boxes_text[mv][0] : boxes_text[mv][0] + boxes_text[mv][2]] all_text_region_raw = all_text_region_raw.astype(np.uint8) - img_int_p = all_text_region_raw[:, :] # self.all_text_region_raw[mv] + img_int_p = all_text_region_raw[:, :] - ##img_int_p=cv2.erode(img_int_p,self.kernel,iterations = 2) + # img_int_p=cv2.erode(img_int_p,self.kernel,iterations = 2) # plt.imshow(img_int_p) # plt.show() @@ -953,13 +920,9 @@ class eynollah: textline_con, hierachy = 
return_contours_of_image(img_int_p) textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con, hierachy, max_area=1, min_area=0.0008) y_diff_mean = find_contours_mean_y_diff(textline_con_fil) - sigma_des = int(y_diff_mean * (4.0 / 40.0)) - - if sigma_des < 1: - sigma_des = 1 + sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) img_int_p[img_int_p > 0] = 1 - # slope_for_all=self.return_deskew_slope_new(img_int_p,sigma_des) slope_for_all = return_deskew_slop(img_int_p, sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) if abs(slope_for_all) < 0.5: @@ -978,9 +941,7 @@ class eynollah: crop_img, crop_coor = crop_image_inside_box(boxes_text[mv], image_page_rotated) if abs(slope_for_all) < 45: - # all_box_coord.append(crop_coor) - textline_region_in_image = np.zeros(textline_mask_tot_ea.shape) cnt_o_t_max = contours_par_per_process[mv] x, y, w, h = cv2.boundingRect(cnt_o_t_max) @@ -1049,7 +1010,6 @@ class eynollah: contours_textregion_par_per_each_subprocess = [] all_box_coord_per_process = [] index_by_text_region_contours = [] - slope_biggest = 0 for mv in range(len(boxes_text)): crop_img,crop_coor=crop_image_inside_box(boxes_text[mv],image_page_rotated) From 58c5d4dce6ccec6b6969f1bde9c678185058b834 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Feb 2021 14:12:03 +0100 Subject: [PATCH 17/89] eliminate unused "slope_first" arg to textline_contours_postprocessing --- sbb_newspapers_org_image/eynollah.py | 9 +++------ sbb_newspapers_org_image/utils/separate_lines.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 4daa61d..aecdeaf 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -913,7 +913,6 @@ class eynollah: if img_int_p.shape[0] / img_int_p.shape[1] < 0.1: slopes_per_each_subprocess.append(0) - slope_first = 0 slope_for_all = [slope_deskew][0] else: try: @@ 
-987,9 +986,8 @@ class eynollah: except: pass else: - slope_first = 0 add_boxes_coor_into_textlines = True - textlines_cnt_per_region = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contours_par_per_process[mv], boxes_text[mv], slope_first, add_boxes_coor_into_textlines) + textlines_cnt_per_region = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contours_par_per_process[mv], boxes_text[mv], add_boxes_coor_into_textlines) add_boxes_coor_into_textlines = False # print(np.shape(textlines_cnt_per_region),'textlines_cnt_per_region') @@ -1047,7 +1045,6 @@ class eynollah: if slope_for_all == 999: slope_for_all = [slope_deskew][0] slopes_per_each_subprocess.append(slope_for_all) - slope_first = 0 mask_only_con_region = np.zeros(textline_mask_tot_ea.shape) mask_only_con_region = cv2.fillPoly(mask_only_con_region, pts=[contours_par_per_process[mv]], color=(1, 1, 1)) @@ -1064,7 +1061,7 @@ class eynollah: ##plt.show() all_text_region_raw[mask_only_con_region == 0] = 0 - cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contours_par_per_process[mv], boxes_text[mv], slope_first) + cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contours_par_per_process[mv], boxes_text[mv]) textlines_rectangles_per_each_subprocess.append(cnt_clean_rot) index_by_text_region_contours.append(indexes_r_con_per_pro[mv]) @@ -2334,7 +2331,7 @@ class eynollah: ##sys.exit() print("deskewing: " + str(time.time() - t1)) - image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] # rotation_not_90_func(image_page,textline_mask_tot_ea,slope_first) + image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] textline_mask_tot[mask_images[:, :] == 1] = 0 pixel_img = 1 diff --git a/sbb_newspapers_org_image/utils/separate_lines.py b/sbb_newspapers_org_image/utils/separate_lines.py index 7452c4e..a7b0b90 100644 --- 
a/sbb_newspapers_org_image/utils/separate_lines.py +++ b/sbb_newspapers_org_image/utils/separate_lines.py @@ -1395,7 +1395,7 @@ def seperate_lines_vertical_cont(img_patch, contour_text_interest, thetha, box_i return None, cont_final -def textline_contours_postprocessing(textline_mask, slope, contour_text_interest, box_ind, slope_first, add_boxes_coor_into_textlines=False): +def textline_contours_postprocessing(textline_mask, slope, contour_text_interest, box_ind, add_boxes_coor_into_textlines=False): textline_mask = np.repeat(textline_mask[:, :, np.newaxis], 3, axis=2) * 255 textline_mask = textline_mask.astype(np.uint8) From 4a903d2ec3675fc2c3866bd5a0dfd230305a88e1 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Feb 2021 14:33:36 +0100 Subject: [PATCH 18/89] utils.find_num_col: clean up and simplify if-else --- sbb_newspapers_org_image/utils/__init__.py | 40 ++-------------------- 1 file changed, 3 insertions(+), 37 deletions(-) diff --git a/sbb_newspapers_org_image/utils/__init__.py b/sbb_newspapers_org_image/utils/__init__.py index 4feb7e0..cc96297 100644 --- a/sbb_newspapers_org_image/utils/__init__.py +++ b/sbb_newspapers_org_image/utils/__init__.py @@ -557,57 +557,35 @@ def return_hor_spliter_by_index_for_without_verticals(peaks_neg_fin_t, x_min_hor def find_num_col(regions_without_seperators, multiplier=3.8): regions_without_seperators_0 = regions_without_seperators[:, :].sum(axis=0) - ##plt.plot(regions_without_seperators_0) ##plt.show() - sigma_ = 35 # 70#35 - meda_n_updown = regions_without_seperators_0[len(regions_without_seperators_0) :: -1] - first_nonzero = next((i for i, x in enumerate(regions_without_seperators_0) if x), 0) last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) - - # print(last_nonzero) - # print(isNaN(last_nonzero)) - # last_nonzero=0#halalikh last_nonzero = len(regions_without_seperators_0) - last_nonzero - y = regions_without_seperators_0 # [first_nonzero:last_nonzero] - y_help = np.zeros(len(y) + 
20) - y_help[10 : len(y) + 10] = y - x = np.array(range(len(y))) - zneg_rev = -y_help + np.max(y_help) - zneg = np.zeros(len(zneg_rev) + 20) - zneg[10 : len(zneg_rev) + 10] = zneg_rev - z = gaussian_filter1d(y, sigma_) zneg = gaussian_filter1d(zneg, sigma_) peaks_neg, _ = find_peaks(zneg, height=0) peaks, _ = find_peaks(z, height=0) - peaks_neg = peaks_neg - 10 - 10 last_nonzero = last_nonzero - 100 first_nonzero = first_nonzero + 200 peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & (peaks_neg < last_nonzero)] - peaks = peaks[(peaks > 0.06 * regions_without_seperators.shape[1]) & (peaks < 0.94 * regions_without_seperators.shape[1])] peaks_neg = peaks_neg[(peaks_neg > 370) & (peaks_neg < (regions_without_seperators.shape[1] - 370))] - - # print(peaks) interest_pos = z[peaks] - interest_pos = interest_pos[interest_pos > 10] - # plt.plot(z) # plt.show() interest_neg = z[peaks_neg] @@ -621,9 +599,7 @@ def find_num_col(regions_without_seperators, multiplier=3.8): min_peaks_neg = 0 # np.min(interest_neg) # print(np.min(interest_pos),np.max(interest_pos),np.max(interest_pos)/np.min(interest_pos),'minmax') - # $print(min_peaks_pos) dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier - # print(interest_pos) grenze = min_peaks_pos - dis_talaei # np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 # print(interest_neg,'interest_neg') @@ -650,15 +626,11 @@ def find_num_col(regions_without_seperators, multiplier=3.8): if (peaks_neg_fin[0] > p_g_u and peaks_neg_fin[1] > p_g_u) or (peaks_neg_fin[0] < p_g_l and peaks_neg_fin[1] < p_g_l) or ((peaks_neg_fin[0] + 200) < p_m and peaks_neg_fin[1] < p_m) or ((peaks_neg_fin[0] - 200) > p_m and peaks_neg_fin[1] > p_m): num_col = 1 peaks_neg_fin = [] - else: - pass if num_col == 2: if (peaks_neg_fin[0] > p_g_u) or (peaks_neg_fin[0] < p_g_l): num_col = 1 peaks_neg_fin = [] - else: - pass ##print(len(peaks_neg_fin)) @@ -673,7 +645,7 @@ def 
find_num_col(regions_without_seperators, multiplier=3.8): for i in range(len(peaks_neg_fin)): if i == 0: forest.append(peaks_neg_fin[i]) - if i < (len(peaks_neg_fin) - 1): + if i < len(peaks_neg_fin) - 1: if diff_peaks[i] <= cut_off: forest.append(peaks_neg_fin[i + 1]) if diff_peaks[i] > cut_off: @@ -687,7 +659,7 @@ def find_num_col(regions_without_seperators, multiplier=3.8): if not isNaN(forest[np.argmin(z[forest])]): peaks_neg_true.append(forest[np.argmin(z[forest])]) - num_col = (len(peaks_neg_true)) + 1 + num_col = len(peaks_neg_true) + 1 p_l = 0 p_u = len(y) - 1 p_m = int(len(y) / 2.0) @@ -706,15 +678,11 @@ def find_num_col(regions_without_seperators, multiplier=3.8): peaks_neg_true = [peaks_neg_true[0]] elif (peaks_neg_true[1] < p_g_u and peaks_neg_true[1] > p_g_l) and (peaks_neg_true[0] < p_quarter): peaks_neg_true = [peaks_neg_true[1]] - else: - pass if num_col == 2: if (peaks_neg_true[0] > p_g_u) or (peaks_neg_true[0] < p_g_l): num_col = 1 peaks_neg_true = [] - else: - pass diff_peaks_annormal = diff_peaks[diff_peaks < 360] @@ -732,9 +700,7 @@ def find_num_col(regions_without_seperators, multiplier=3.8): else: peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - elif (ii - 1) in arg_help_ann: - pass - else: + elif (ii - 1) not in arg_help_ann: peaks_neg_fin_new.append(peaks_neg_fin[ii]) else: peaks_neg_fin_new = peaks_neg_fin From 853fd12e403b75716fb7a1a291c1a949ad8f8a6a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Feb 2021 14:34:25 +0100 Subject: [PATCH 19/89] extract matplotlib code to method --- sbb_newspapers_org_image/eynollah.py | 59 ++++++++-------------------- 1 file changed, 17 insertions(+), 42 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index aecdeaf..859f22a 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1,4 +1,4 @@ -# pylint: disable=no-member,invalid-name,line-too-long +# pylint: 
disable=no-member,invalid-name,line-too-long,missing-function-docstring """ tool to extract table form data from alto xml data """ @@ -2112,14 +2112,11 @@ class eynollah: def save_plot_of_layout_main(self, text_regions_p, image_page): values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels=['Background' , 'Main text' , 'Image' , 'Separator','Marginalia'] values_indexes = [0, 1, 2, 3, 4] plt.figure(figsize=(40, 40)) plt.rcParams["font.size"] = "40" - im = plt.imshow(text_regions_p[:, :]) colors = [im.cmap(im.norm(value)) for value in values] patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] @@ -2128,12 +2125,9 @@ class eynollah: def save_plot_of_layout_main_all(self, text_regions_p, image_page): values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels=['Background' , 'Main text' , 'Image' , 'Separator','Marginalia'] values_indexes = [0, 1, 2, 3, 4] - plt.figure(figsize=(80, 40)) plt.rcParams["font.size"] = "40" plt.subplot(1, 2, 1) @@ -2143,14 +2137,11 @@ class eynollah: colors = [im.cmap(im.norm(value)) for value in values] patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_main_and_page.png")) def save_plot_of_layout(self, text_regions_p, image_page): values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels = 
["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"] values_indexes = [0, 1, 2, 8, 4, 5, 6] plt.figure(figsize=(40, 40)) @@ -2163,12 +2154,9 @@ class eynollah: def save_plot_of_layout_all(self, text_regions_p, image_page): values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels = ["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"] values_indexes = [0, 1, 2, 8, 4, 5, 6] - plt.figure(figsize=(80, 40)) plt.rcParams["font.size"] = "40" plt.subplot(1, 2, 1) @@ -2178,9 +2166,23 @@ class eynollah: colors = [im.cmap(im.norm(value)) for value in values] patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_and_page.png")) + def save_plot_of_textlines(self, textline_mask_tot_ea, image_page): + values = np.unique(textline_mask_tot_ea[:, :]) + pixels = ["Background", "Textlines"] + values_indexes = [0, 1] + plt.figure(figsize=(80, 40)) + plt.rcParams["font.size"] = "40" + plt.subplot(1, 2, 1) + plt.imshow(image_page) + plt.subplot(1, 2, 2) + im = plt.imshow(textline_mask_tot_ea[:, :]) + colors = [im.cmap(im.norm(value)) for value in values] + patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] + plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_textline_and_page.png")) + def save_deskewed_image(self, slope_deskew): img_rotated = rotyate_image_different(self.image_org, slope_deskew) @@ 
-2297,20 +2299,7 @@ class eynollah: gc.collect() #print(np.unique(textline_mask_tot_ea[:, :]), "textline") if self.dir_of_all is not None: - values = np.unique(textline_mask_tot_ea[:, :]) - pixels = ["Background", "Textlines"] - values_indexes = [0, 1] - plt.figure(figsize=(80, 40)) - plt.rcParams["font.size"] = "40" - plt.subplot(1, 2, 1) - plt.imshow(image_page) - plt.subplot(1, 2, 2) - im = plt.imshow(textline_mask_tot_ea[:, :]) - colors = [im.cmap(im.norm(value)) for value in values] - patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] - plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - - plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_textline_and_page.png")) + self.save_plot_of_textlines(textline_mask_tot_ea, image_page) print("textline: " + str(time.time() - t1)) # plt.imshow(textline_mask_tot_ea) # plt.show() @@ -2729,18 +2718,4 @@ class eynollah: # order_text_new , id_of_texts_tot=self.do_order_of_regions(contours_only_text_parent,contours_only_text_parent_h,boxes,textline_mask_tot) self.write_into_page_xml(txt_con_org, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals) - ##except: - ##txt_con_org = [] - ##order_text_new = [] - ##id_of_texts_tot = [] - ##all_found_texline_polygons = [] - ##all_box_coord = [] - ##polygons_of_images = [] - ##polygons_of_marginals = [] - ##all_found_texline_polygons_marginals = [] - ##all_box_coord_marginals = [] - ##slopes = [] - ##slopes_marginals = [] - ##self.write_into_page_xml(txt_con_org, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, 
all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals) - print("Job done in: " + str(time.time() - t1)) From c2e9ebb366592a1069811a652b39a4e8299da0e7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 2 Feb 2021 15:36:43 +0100 Subject: [PATCH 20/89] move all plotting code to EynollahPlotter --- sbb_newspapers_org_image/cli.py | 15 +- sbb_newspapers_org_image/eynollah.py | 163 ++++-------------- sbb_newspapers_org_image/plot.py | 159 +++++++++++++++++ sbb_newspapers_org_image/utils/__init__.py | 105 +---------- .../utils/separate_lines.py | 43 +---- 5 files changed, 219 insertions(+), 266 deletions(-) create mode 100644 sbb_newspapers_org_image/plot.py diff --git a/sbb_newspapers_org_image/cli.py b/sbb_newspapers_org_image/cli.py index 03d0167..c9fd772 100644 --- a/sbb_newspapers_org_image/cli.py +++ b/sbb_newspapers_org_image/cli.py @@ -4,13 +4,18 @@ from sbb_newspapers_org_image.eynollah import eynollah @click.command() @click.option( - "--image", "-i", help="image filename", type=click.Path(exists=True, dir_okay=False) + "--image", + "-i", + help="image filename", + type=click.Path(exists=True, dir_okay=False), + required=True, ) @click.option( "--out", "-o", help="directory to write output xml data", type=click.Path(exists=True, file_okay=False), + required=True, ) @click.option( "--model", @@ -42,6 +47,12 @@ from sbb_newspapers_org_image.eynollah import eynollah help="if a directory is given, all plots needed for documentation will be saved there", type=click.Path(exists=True, file_okay=False), ) +@click.option( + "--enable_plotting", + "-ep", + is_flag=True, + help="If set, will plot intermediary files and images", +) @click.option( "--allow_enhancement", "-ae", @@ -80,6 +91,7 @@ def main( save_layout, save_deskewed, save_all, + enable_plotting, allow_enhancement, curved_line, full_layout, @@ -95,6 +107,7 @@ def main( save_layout, save_deskewed, save_all, + enable_plotting, allow_enhancement, 
curved_line, full_layout, diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 859f22a..baf9f92 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -12,11 +12,9 @@ import time import warnings from pathlib import Path from multiprocessing import Process, Queue, cpu_count -from sys import getsizeof import cv2 import numpy as np -import matplotlib.pyplot as plt os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" stderr = sys.stderr @@ -117,10 +115,10 @@ from .utils import ( from .utils.xml import create_page_xml +from .plot import EynollahPlotter SLOPE_THRESHOLD = 0.13 - class eynollah: def __init__( self, @@ -132,6 +130,7 @@ class eynollah: dir_of_layout=None, dir_of_deskewed=None, dir_of_all=None, + enable_plotting=False, allow_enhancement=False, curved_line=False, full_layout=False, @@ -142,17 +141,21 @@ class eynollah: self.cont_page = [] self.dir_out = dir_out self.image_filename_stem = image_filename_stem - self.dir_of_cropped_images = dir_of_cropped_images self.allow_enhancement = allow_enhancement self.curved_line = curved_line self.full_layout = full_layout self.allow_scaling = allow_scaling - self.dir_of_layout = dir_of_layout self.headers_off = headers_off - self.dir_of_deskewed = dir_of_deskewed - self.dir_of_all = dir_of_all if not self.image_filename_stem: self.image_filename_stem = Path(Path(image_filename).name).stem + self.plotter = None if not enable_plotting else EynollahPlotter( + dir_of_all=dir_of_all, + dir_of_deskewed=dir_of_deskewed, + dir_of_cropped_images=dir_of_cropped_images, + dir_of_layout=dir_of_layout, + image_filename=image_filename, + image_filename_stem=image_filename_stem, + ) self.dir_models = dir_models self.kernel = np.ones((5, 5), np.uint8) @@ -448,8 +451,12 @@ class eynollah: self.scale_x = self.img_width_int / float(self.image.shape[1]) self.image = resize_image(self.image, self.img_hight_int, self.img_width_int) - del img_res - del img_org + + # Also 
set for the plotter + # XXX TODO hacky + self.plotter.image_org = self.image_org + self.plotter.scale_y = self.scale_y + self.plotter.scale_x = self.scale_x def get_image_and_scales_after_enhancing(self, img_org, img_res): @@ -922,7 +929,7 @@ class eynollah: sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) img_int_p[img_int_p > 0] = 1 - slope_for_all = return_deskew_slop(img_int_p, sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) + slope_for_all = return_deskew_slop(img_int_p, sigma_des, plotter=self.plotter) if abs(slope_for_all) < 0.5: slope_for_all = [slope_deskew][0] @@ -950,7 +957,7 @@ class eynollah: textline_biggest_region = mask_biggest * textline_mask_tot_ea # print(slope_for_all,'slope_for_all') - textline_rotated_seperated = seperate_lines_new2(textline_biggest_region[y : y + h, x : x + w], 0, num_col, slope_for_all, self.dir_of_all, self.image_filename_stem) + textline_rotated_seperated = seperate_lines_new2(textline_biggest_region[y : y + h, x : x + w], 0, num_col, slope_for_all, plotter=self.plotter) # new line added ##print(np.shape(textline_rotated_seperated),np.shape(mask_biggest)) @@ -1036,7 +1043,7 @@ class eynollah: if sigma_des < 1: sigma_des = 1 img_int_p[img_int_p > 0] = 1 - slope_for_all = return_deskew_slop(img_int_p, sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) + slope_for_all = return_deskew_slop(img_int_p, sigma_des, plotter=self.plotter) if abs(slope_for_all) <= 0.5: slope_for_all = [slope_deskew][0] except: @@ -1127,7 +1134,7 @@ class eynollah: sigma_des = 1 crop_img[crop_img > 0] = 1 - slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) + slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, plotter=self.plotter) except: slope_corresponding_textregion = 999 @@ -1814,20 +1821,6 @@ class eynollah: return text_regions_p_true - - def 
write_images_into_directory(self, img_contoures, dir_of_cropped_imgs, image_page): - index = 0 - for cont_ind in img_contoures: - x, y, w, h = cv2.boundingRect(cont_ind) - box = [x, y, w, h] - croped_page, page_coord = crop_image_inside_box(box, image_page) - - croped_page = resize_image(croped_page, int(croped_page.shape[0] / self.scale_y), int(croped_page.shape[1] / self.scale_x)) - - path = os.path.join(dir_of_cropped_imgs, self.image_filename_stem + "_" + str(index) + ".jpg") - cv2.imwrite(path, croped_page) - index += 1 - def do_order_of_regions(self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): if self.full_layout: @@ -2110,88 +2103,6 @@ class eynollah: return order_text_new, id_of_texts_tot - def save_plot_of_layout_main(self, text_regions_p, image_page): - values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels=['Background' , 'Main text' , 'Image' , 'Separator','Marginalia'] - values_indexes = [0, 1, 2, 3, 4] - plt.figure(figsize=(40, 40)) - plt.rcParams["font.size"] = "40" - im = plt.imshow(text_regions_p[:, :]) - colors = [im.cmap(im.norm(value)) for value in values] - patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] - plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=40) - plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout_main.png")) - - def save_plot_of_layout_main_all(self, text_regions_p, image_page): - values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels=['Background' , 'Main text' , 'Image' , 'Separator','Marginalia'] - values_indexes = [0, 1, 2, 3, 4] - plt.figure(figsize=(80, 40)) 
- plt.rcParams["font.size"] = "40" - plt.subplot(1, 2, 1) - plt.imshow(image_page) - plt.subplot(1, 2, 2) - im = plt.imshow(text_regions_p[:, :]) - colors = [im.cmap(im.norm(value)) for value in values] - patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] - plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_main_and_page.png")) - - def save_plot_of_layout(self, text_regions_p, image_page): - values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels = ["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"] - values_indexes = [0, 1, 2, 8, 4, 5, 6] - plt.figure(figsize=(40, 40)) - plt.rcParams["font.size"] = "40" - im = plt.imshow(text_regions_p[:, :]) - colors = [im.cmap(im.norm(value)) for value in values] - patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] - plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=40) - plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout.png")) - - def save_plot_of_layout_all(self, text_regions_p, image_page): - values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels = ["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"] - values_indexes = [0, 1, 2, 8, 4, 5, 6] - plt.figure(figsize=(80, 40)) - plt.rcParams["font.size"] = "40" - plt.subplot(1, 2, 1) - plt.imshow(image_page) - plt.subplot(1, 2, 2) - im = 
plt.imshow(text_regions_p[:, :]) - colors = [im.cmap(im.norm(value)) for value in values] - patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] - plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_and_page.png")) - - def save_plot_of_textlines(self, textline_mask_tot_ea, image_page): - values = np.unique(textline_mask_tot_ea[:, :]) - pixels = ["Background", "Textlines"] - values_indexes = [0, 1] - plt.figure(figsize=(80, 40)) - plt.rcParams["font.size"] = "40" - plt.subplot(1, 2, 1) - plt.imshow(image_page) - plt.subplot(1, 2, 2) - im = plt.imshow(textline_mask_tot_ea[:, :]) - colors = [im.cmap(im.norm(value)) for value in values] - patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] - plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_textline_and_page.png")) - - def save_deskewed_image(self, slope_deskew): - img_rotated = rotyate_image_different(self.image_org, slope_deskew) - - if self.dir_of_all is not None: - cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_org.png"), self.image_org) - - cv2.imwrite(os.path.join(self.dir_of_deskewed, self.image_filename_stem + "_deskewed.png"), img_rotated) - del img_rotated - def run(self): is_image_enhanced = False # get image and sclaes, then extract the page of scanned image @@ -2243,8 +2154,8 @@ class eynollah: image_page, page_coord = self.extract_page() # print(image_page.shape,'page') - if self.dir_of_all is not None: - cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_page.png"), image_page) + if self.plotter: + 
self.plotter.save_page_image(image_page) K.clear_session() gc.collect() @@ -2298,8 +2209,8 @@ class eynollah: K.clear_session() gc.collect() #print(np.unique(textline_mask_tot_ea[:, :]), "textline") - if self.dir_of_all is not None: - self.save_plot_of_textlines(textline_mask_tot_ea, image_page) + if self.plotter: + self.plotter.save_plot_of_textlines(textline_mask_tot_ea, image_page) print("textline: " + str(time.time() - t1)) # plt.imshow(textline_mask_tot_ea) # plt.show() @@ -2307,11 +2218,11 @@ class eynollah: sigma = 2 main_page_deskew = True - slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2), sigma, main_page_deskew, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) - slope_first = 0 # return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2),sigma, dir_of_all=self.dir_of_all, image_filename_stem=self.image_filename_stem) + slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2), sigma, main_page_deskew, plotter=self.plotter) + slope_first = 0 # return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2),sigma, plotter=self.plotter) - if self.dir_of_deskewed is not None: - self.save_deskewed_image(slope_deskew) + if self.plotter: + self.plotter.save_deskewed_image(slope_deskew) # img_rotated=rotyate_image_different(self.image_org,slope_deskew) print(slope_deskew, "slope_deskew") @@ -2344,10 +2255,9 @@ class eynollah: # plt.imshow(text_regions_p) # plt.show() - if self.dir_of_all is not None: - self.save_plot_of_layout_main_all(text_regions_p, image_page) - if self.dir_of_layout is not None: - self.save_plot_of_layout_main(text_regions_p, image_page) + if self.plotter: + self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) + self.plotter.save_plot_of_layout_main(text_regions_p, image_page) print("marginals: " + str(time.time() - t1)) @@ -2632,10 +2542,9 @@ class eynollah: 
contours_only_text_parent_d_ordered = None text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) - if self.dir_of_layout is not None: - self.save_plot_of_layout(text_regions_p, image_page) - if self.dir_of_all is not None: - self.save_plot_of_layout_all(text_regions_p, image_page) + if self.plotter: + self.plotter.save_plot_of_layout(text_regions_p, image_page) + self.plotter.save_plot_of_layout_all(text_regions_p, image_page) K.clear_session() gc.collect() @@ -2696,8 +2605,8 @@ class eynollah: boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) # print(slopes) - if self.dir_of_cropped_images is not None: - self.write_images_into_directory(polygons_of_images, self.dir_of_cropped_images, image_page) + if self.plotter: + self.plotter.write_images_into_directory(polygons_of_images, image_page) if self.full_layout: if np.abs(slope_deskew) < SLOPE_THRESHOLD: diff --git a/sbb_newspapers_org_image/plot.py b/sbb_newspapers_org_image/plot.py new file mode 100644 index 0000000..cba8b58 --- /dev/null +++ b/sbb_newspapers_org_image/plot.py @@ -0,0 +1,159 @@ +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +import numpy as np +import os.path +import cv2 +from scipy.ndimage import gaussian_filter1d + +from .utils import crop_image_inside_box +from .utils.rotate import rotyate_image_different +from .utils.resize import resize_image + +class EynollahPlotter(): + """ + Class collecting all the plotting and image writing methods + """ + + def __init__( + self, + *, + 
dir_of_all, + dir_of_deskewed, + dir_of_layout, + dir_of_cropped_images, + image_filename, + image_filename_stem, + image_org=None, + scale_x=1, + scale_y=1, + ): + self.dir_of_all = dir_of_all + self.dir_of_layout = dir_of_layout + self.dir_of_cropped_images = dir_of_cropped_images + self.dir_of_deskewed = dir_of_deskewed + self.image_filename = image_filename + self.image_filename_stem = image_filename_stem + # XXX TODO hacky these cannot be set at init time + self.image_org = image_org + self.scale_x = scale_x + self.scale_y = scale_y + + def save_plot_of_layout_main(self, text_regions_p, image_page): + values = np.unique(text_regions_p[:, :]) + # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] + pixels=['Background' , 'Main text' , 'Image' , 'Separator','Marginalia'] + values_indexes = [0, 1, 2, 3, 4] + plt.figure(figsize=(40, 40)) + plt.rcParams["font.size"] = "40" + im = plt.imshow(text_regions_p[:, :]) + colors = [im.cmap(im.norm(value)) for value in values] + patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] + plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=40) + plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout_main.png")) + + def save_plot_of_layout_main_all(self, text_regions_p, image_page): + values = np.unique(text_regions_p[:, :]) + # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] + pixels=['Background' , 'Main text' , 'Image' , 'Separator','Marginalia'] + values_indexes = [0, 1, 2, 3, 4] + plt.figure(figsize=(80, 40)) + plt.rcParams["font.size"] = "40" + plt.subplot(1, 2, 1) + plt.imshow(image_page) + plt.subplot(1, 2, 2) + im = plt.imshow(text_regions_p[:, :]) + colors = [im.cmap(im.norm(value)) for value in 
values] + patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] + plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_main_and_page.png")) + + def save_plot_of_layout(self, text_regions_p, image_page): + values = np.unique(text_regions_p[:, :]) + # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] + pixels = ["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"] + values_indexes = [0, 1, 2, 8, 4, 5, 6] + plt.figure(figsize=(40, 40)) + plt.rcParams["font.size"] = "40" + im = plt.imshow(text_regions_p[:, :]) + colors = [im.cmap(im.norm(value)) for value in values] + patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] + plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=40) + plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout.png")) + + def save_plot_of_layout_all(self, text_regions_p, image_page): + values = np.unique(text_regions_p[:, :]) + # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] + pixels = ["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"] + values_indexes = [0, 1, 2, 8, 4, 5, 6] + plt.figure(figsize=(80, 40)) + plt.rcParams["font.size"] = "40" + plt.subplot(1, 2, 1) + plt.imshow(image_page) + plt.subplot(1, 2, 2) + im = plt.imshow(text_regions_p[:, :]) + colors = [im.cmap(im.norm(value)) for value in values] + patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes 
== i)[0][0])])) for i in values] + plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_and_page.png")) + + def save_plot_of_textlines(self, textline_mask_tot_ea, image_page): + values = np.unique(textline_mask_tot_ea[:, :]) + pixels = ["Background", "Textlines"] + values_indexes = [0, 1] + plt.figure(figsize=(80, 40)) + plt.rcParams["font.size"] = "40" + plt.subplot(1, 2, 1) + plt.imshow(image_page) + plt.subplot(1, 2, 2) + im = plt.imshow(textline_mask_tot_ea[:, :]) + colors = [im.cmap(im.norm(value)) for value in values] + patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] + plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_textline_and_page.png")) + + def save_deskewed_image(self, slope_deskew): + if self.dir_of_all is not None: + img_rotated = rotyate_image_different(self.image_org, slope_deskew) + cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_org.png"), self.image_org) + cv2.imwrite(os.path.join(self.dir_of_deskewed, self.image_filename_stem + "_deskewed.png"), img_rotated) + + def save_page_image(self, image_page): + cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_page.png"), image_page) + + def save_plot_of_textline_density(self, img_patch_org): + plt.figure(figsize=(80,40)) + plt.rcParams['font.size']='50' + plt.subplot(1,2,1) + plt.imshow(img_patch_org) + plt.subplot(1,2,2) + plt.plot(gaussian_filter1d(img_patch_org.sum(axis=1), 3),np.array(range(len(gaussian_filter1d(img_patch_org.sum(axis=1), 3)))),linewidth=8) + plt.xlabel('Density of textline prediction in direction of X axis',fontsize=60) + plt.ylabel('Height',fontsize=60) + 
plt.yticks([0,len(gaussian_filter1d(img_patch_org.sum(axis=1), 3))]) + plt.gca().invert_yaxis() + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem+'_density_of_textline.png')) + + def save_plot_of_rotation_angle(self, angels, var_res): + #print('galdi?') + plt.figure(figsize=(60,30)) + plt.rcParams['font.size']='50' + plt.plot(angels,np.array(var_res),'-o',markersize=25,linewidth=4) + plt.xlabel('angle',fontsize=50) + plt.ylabel('variance of sum of rotated textline in direction of x axis',fontsize=50) + plt.plot(angels[np.argmax(var_res)],var_res[np.argmax(np.array(var_res))] ,'*',markersize=50,label='Angle of deskewing=' +str("{:.2f}".format(angels[np.argmax(var_res)]))+r'$\degree$') + plt.legend(loc='best') + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem+'_rotation_angle.png')) + + def write_images_into_directory(self, img_contoures, image_page): + index = 0 + for cont_ind in img_contoures: + x, y, w, h = cv2.boundingRect(cont_ind) + box = [x, y, w, h] + croped_page, page_coord = crop_image_inside_box(box, image_page) + + croped_page = resize_image(croped_page, int(croped_page.shape[0] / self.scale_y), int(croped_page.shape[1] / self.scale_x)) + + path = os.path.join(self.dir_of_cropped_images, self.image_filename_stem + "_" + str(index) + ".jpg") + cv2.imwrite(path, croped_page) + index += 1 + diff --git a/sbb_newspapers_org_image/utils/__init__.py b/sbb_newspapers_org_image/utils/__init__.py index cc96297..781864d 100644 --- a/sbb_newspapers_org_image/utils/__init__.py +++ b/sbb_newspapers_org_image/utils/__init__.py @@ -372,108 +372,9 @@ def boosting_headers_by_longshot_region_segmentation(textregion_pre_p, textregio def find_num_col_deskew(regions_without_seperators, sigma_, multiplier=3.8): - regions_without_seperators_0=regions_without_seperators[:,:].sum(axis=1) - - ##meda_n_updown=regions_without_seperators_0[len(regions_without_seperators_0)::-1] - - ##first_nonzero=(next((i for i, x in 
enumerate(regions_without_seperators_0) if x), 0)) - ##last_nonzero=(next((i for i, x in enumerate(meda_n_updown) if x), 0)) - - ##last_nonzero=len(regions_without_seperators_0)-last_nonzero - - - y=regions_without_seperators_0#[first_nonzero:last_nonzero] - - ##y_help=np.zeros(len(y)+20) - - ##y_help[10:len(y)+10]=y - - ##x=np.array( range(len(y)) ) - - - - - ##zneg_rev=-y_help+np.max(y_help) - - ##zneg=np.zeros(len(zneg_rev)+20) - - ##zneg[10:len(zneg_rev)+10]=zneg_rev - - z=gaussian_filter1d(y, sigma_) - ###zneg= gaussian_filter1d(zneg, sigma_) - - - ###peaks_neg, _ = find_peaks(zneg, height=0) - ###peaks, _ = find_peaks(z, height=0) - - ###peaks_neg=peaks_neg-10-10 - - ####print(np.std(z),'np.std(z)np.std(z)np.std(z)') - - #####plt.plot(z) - #####plt.show() - - #####plt.imshow(regions_without_seperators) - #####plt.show() - ###""" - ###last_nonzero=last_nonzero-0#100 - ###first_nonzero=first_nonzero+0#+100 - - ###peaks_neg=peaks_neg[(peaks_neg>first_nonzero) & (peaks_neg.06*regions_without_seperators.shape[1]) & (peaks<0.94*regions_without_seperators.shape[1])] - ###""" - ###interest_pos=z[peaks] - - ###interest_pos=interest_pos[interest_pos>10] - - ###interest_neg=z[peaks_neg] - - ###min_peaks_pos=np.mean(interest_pos) - ###min_peaks_neg=0#np.min(interest_neg) - - ###dis_talaei=(min_peaks_pos-min_peaks_neg)/multiplier - ####print(interest_pos) - ###grenze=min_peaks_pos-dis_talaei#np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 - - ###interest_neg_fin=interest_neg[(interest_neg Date: Wed, 3 Feb 2021 14:49:46 +0100 Subject: [PATCH 21/89] Headers were not written correctly. 
Fixed --- sbb_newspapers_org_image/eynollah.py | 36 ++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index baf9f92..3267543 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -33,6 +33,7 @@ from lxml import etree as ET from matplotlib import pyplot, transforms import matplotlib.patches as mpatches import imutils +import matplotlib.pyplot as plt from .utils.contour import ( contours_in_same_horizon, @@ -454,9 +455,12 @@ class eynollah: # Also set for the plotter # XXX TODO hacky - self.plotter.image_org = self.image_org - self.plotter.scale_y = self.scale_y - self.plotter.scale_x = self.scale_x + #self.plotter.image_org = self.image_org + + #self.plotter.scale_y = self.scale_y + #self.plotter.scale_x = self.scale_x + + def get_image_and_scales_after_enhancing(self, img_org, img_res): @@ -468,6 +472,7 @@ class eynollah: self.scale_y = img_res.shape[0] / float(self.image_org.shape[0]) self.scale_x = img_res.shape[1] / float(self.image_org.shape[1]) + del img_org del img_res @@ -1324,9 +1329,11 @@ class eynollah: #else: # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') + + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord, slopes, id_indexer_l) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) texteqreg=ET.SubElement(textregion, 'TextEquiv') unireg=ET.SubElement(texteqreg, 'Unicode') @@ -1459,9 +1466,14 @@ class eynollah: except: pass - print(dir_of_image) - print(self.f_name) - print(os.path.join(dir_of_image, self.f_name) + ".xml") + ##print(dir_of_image) + ##print(self.f_name) + 
##print(os.path.join(dir_of_image, self.f_name) + ".xml") + ##tree = ET.ElementTree(pcgts) + ##tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") + + print(self.image_filename_stem) + # print(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") tree = ET.ElementTree(pcgts) tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") @@ -2134,7 +2146,9 @@ class eynollah: img_org, img_res, is_image_enhanced = self.resize_image_with_column_classifier(is_image_enhanced) self.get_image_and_scales_after_enhancing(img_org, img_res) - # print(self.scale_x) + + + print("enhancing: " + str(time.time() - t1)) text_regions_p_1 = self.get_regions_from_xy_2models(img_res, is_image_enhanced) @@ -2377,8 +2391,8 @@ class eynollah: text_regions_p[:, :][regions_fully[:, :, 0] == 4] = 4 text_regions_p[:, :][regions_fully_np[:, :, 0] == 4] = 4 - # plt.imshow(text_regions_p) - # plt.show() + #plt.imshow(text_regions_p) + #plt.show() if np.abs(slope_deskew) >= SLOPE_THRESHOLD: image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout(image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) @@ -2541,6 +2555,8 @@ class eynollah: else: contours_only_text_parent_d_ordered = None text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + + if self.plotter: self.plotter.save_plot_of_layout(text_regions_p, image_page) From 39052032945f12fd005afb1b14d3f46c75871b36 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Wed, 3 Feb 2021 14:51:12 +0100 Subject: [PATCH 22/89] 
matplotlib.pyplot removed --- sbb_newspapers_org_image/eynollah.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 3267543..d515d14 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -33,7 +33,6 @@ from lxml import etree as ET from matplotlib import pyplot, transforms import matplotlib.patches as mpatches import imutils -import matplotlib.pyplot as plt from .utils.contour import ( contours_in_same_horizon, From 4e1956df5e0110848a6e6fd642eb987a33538a34 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Feb 2021 15:21:14 +0100 Subject: [PATCH 23/89] do an actual test run --- .github/workflows/test-eynollah.yml | 2 +- Makefile | 3 ++ tests/__init__.py | 0 tests/base.py | 54 +++++++++++++++++++++++++++++ tests/test_run.py | 24 +++++++++++++ 5 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 tests/__init__.py create mode 100644 tests/base.py create mode 100644 tests/test_run.py diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index 4dfb772..e58d26b 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -32,4 +32,4 @@ jobs: python -m pip install --upgrade pip pip install . 
- name: Test with pytest - run: echo success # make test + run: make test diff --git a/Makefile b/Makefile index 9855fa5..920f15b 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,6 @@ +EYNOLLAH_MODELS ?= $(PWD)/models_eynollah +export EYNOLLAH_MODELS + # BEGIN-EVAL makefile-parser --make-help Makefile help: diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/base.py b/tests/base.py new file mode 100644 index 0000000..9de35ef --- /dev/null +++ b/tests/base.py @@ -0,0 +1,54 @@ +# pylint: disable=unused-import + +from os.path import dirname, realpath +from os import chdir +import sys +import logging +import io +import collections +from unittest import TestCase as VanillaTestCase, skip, main as unittests_main +import pytest +from ocrd_utils import disableLogging, initLogging + +def main(fn=None): + if fn: + sys.exit(pytest.main([fn])) + else: + unittests_main() + +class TestCase(VanillaTestCase): + + @classmethod + def setUpClass(cls): + chdir(dirname(realpath(__file__)) + '/..') + + def setUp(self): + disableLogging() + initLogging() + +class CapturingTestCase(TestCase): + """ + A TestCase that needs to capture stderr/stdout and invoke click CLI. + """ + + @pytest.fixture(autouse=True) + def _setup_pytest_capfd(self, capfd): + self.capfd = capfd + + def invoke_cli(self, cli, args): + """ + Substitution for click.CliRunner.invoke that works together nicely + with unittests/pytest capturing stdout/stderr.
+ """ + self.capture_out_err() # XXX snapshot just before executing the CLI + code = 0 + sys.argv[1:] = args # XXX necessary because sys.argv reflects pytest args not cli args + try: + cli.main(args=args) + except SystemExit as e: + code = e.code + out, err = self.capture_out_err() + return code, out, err + + def capture_out_err(self): + return self.capfd.readouterr() diff --git a/tests/test_run.py b/tests/test_run.py new file mode 100644 index 0000000..74f7fde --- /dev/null +++ b/tests/test_run.py @@ -0,0 +1,24 @@ +from os import environ +from pathlib import Path +from ocrd_utils import pushd_popd +from tests.base import CapturingTestCase as TestCase, main +from sbb_newspapers_org_image.cli import main as eynollah_cli + +testdir = Path(__file__).parent.resolve() + +EYNOLLAH_MODELS = environ.get('EYNOLLAH_MODELS', str(testdir.joinpath('..', 'models_eynollah').resolve())) + +class TestEynollahRun(TestCase): + + def test_full_run(self): + with pushd_popd(tempdir=True) as tempdir: + code, out, err = self.invoke_cli(eynollah_cli, [ + '-m', EYNOLLAH_MODELS, + '-i', str(testdir.joinpath('resources/kant_aufklaerung_1784_0020.tif')), + '-o', tempdir + ]) + print(code, out, err) + assert not code + +if __name__ == '__main__': + main(__file__) From 6bb32c0bfd8344cfbfb40d48c6690340e9a72683 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Feb 2021 15:27:20 +0100 Subject: [PATCH 24/89] ci: install test dependencies --- .github/workflows/test-eynollah.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml index e58d26b..1afd2a6 100644 --- a/.github/workflows/test-eynollah.yml +++ b/.github/workflows/test-eynollah.yml @@ -31,5 +31,6 @@ jobs: run: | python -m pip install --upgrade pip pip install . 
+ pip install -r requirements-test.txt - name: Test with pytest run: make test From ef1e32ee977ecb18de38b9f13d5f11f228a9276f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Feb 2021 15:37:36 +0100 Subject: [PATCH 25/89] restrict keras version to < 2.4 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8368e37..4bb6103 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # ocrd includes opencv, numpy, shapely, click -ocrd >= 2.20.1 -keras >= 2.3.1 +ocrd >= 2.20.3 +keras >= 2.3.1, < 2.4 scikit-learn >= 0.23.2 tensorflow-gpu >= 1.15, < 2 imutils >= 0.5.3 From e0418773d9a8fe060fc68be74a26bbcb52efdf0d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Feb 2021 17:39:00 +0100 Subject: [PATCH 26/89] start logging --- sbb_newspapers_org_image/cli.py | 2 ++ sbb_newspapers_org_image/eynollah.py | 29 ++++++++++++++++------------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/sbb_newspapers_org_image/cli.py b/sbb_newspapers_org_image/cli.py index c9fd772..369afc8 100644 --- a/sbb_newspapers_org_image/cli.py +++ b/sbb_newspapers_org_image/cli.py @@ -1,4 +1,5 @@ import click +from ocrd_utils import initLogging from sbb_newspapers_org_image.eynollah import eynollah @@ -98,6 +99,7 @@ def main( allow_scaling, headers_off, ): + initLogging() eynollah( image, None, diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index d515d14..365012f 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -12,6 +12,7 @@ import time import warnings from pathlib import Path from multiprocessing import Process, Queue, cpu_count +from ocrd_utils import getLogger import cv2 import numpy as np @@ -156,6 +157,7 @@ class eynollah: image_filename=image_filename, image_filename_stem=image_filename_stem, ) + self.logger = getLogger('eynollah') self.dir_models = dir_models self.kernel = 
np.ones((5, 5), np.uint8) @@ -382,7 +384,9 @@ class eynollah: return img, img_new, is_image_enhanced def resize_and_enhance_image_with_column_classifier(self, is_image_enhanced): + self.logger.debug("enter resize_and_enhance_image_with_column_classifier") dpi = self.check_dpi() + self.logger.info("Detected %s DPI" % dpi) img = cv2.imread(self.image_filename) img = img.astype(np.uint8) @@ -477,6 +481,7 @@ class eynollah: del img_res def start_new_session_and_model(self, model_dir): + self.logger.debug("enter start_new_session_and_model") config = tf.ConfigProto() config.gpu_options.allow_growth = True @@ -486,6 +491,7 @@ class eynollah: return model, session def do_prediction(self, patches, img, model, marginal_of_patch_percent=0.1): + self.logger.debug("enter do_prediction") img_height_model = model.layers[len(model.layers) - 1].output_shape[1] img_width_model = model.layers[len(model.layers) - 1].output_shape[2] @@ -615,6 +621,7 @@ class eynollah: return prediction_true def early_page_for_num_of_column_classification(self): + self.logger.debug("enter resize_and_enhance_image_with_column_classifier") img = cv2.imread(self.image_filename) img = img.astype(np.uint8) patches = False @@ -2115,6 +2122,7 @@ class eynollah: return order_text_new, id_of_texts_tot def run(self): + self.logger.debug("enter run") is_image_enhanced = False # get image and sclaes, then extract the page of scanned image t1 = time.time() @@ -2122,39 +2130,36 @@ class eynollah: ########## ###is_image_enhanced,img_org,img_res=self.resize_and_enhance_image(is_image_enhanced) + self.logger.info("resize and enhance image") is_image_enhanced, img_org, img_res, num_col_classifier, num_column_is_classified = self.resize_and_enhance_image_with_column_classifier(is_image_enhanced) + self.logger.info("Image is %senhanced" % 'is ' if is_image_enhanced else '') - print(is_image_enhanced, "is_image_enhanced") K.clear_session() scale = 1 - if (self.allow_enhancement) and is_image_enhanced: + if 
self.allow_enhancement and is_image_enhanced: cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif", img_res) img_res = img_res.astype(np.uint8) self.get_image_and_scales(img_org, img_res, scale) - if (not self.allow_enhancement) and is_image_enhanced: + if not self.allow_enhancement and is_image_enhanced: self.get_image_and_scales_after_enhancing(img_org, img_res) - if (self.allow_enhancement) and not is_image_enhanced: + if self.allow_enhancement and not is_image_enhanced: self.get_image_and_scales(img_org, img_res, scale) - if (not self.allow_enhancement) and not is_image_enhanced: + if not self.allow_enhancement and not is_image_enhanced: self.get_image_and_scales(img_org, img_res, scale) - if (self.allow_scaling) and not is_image_enhanced: + if self.allow_scaling and not is_image_enhanced: img_org, img_res, is_image_enhanced = self.resize_image_with_column_classifier(is_image_enhanced) self.get_image_and_scales_after_enhancing(img_org, img_res) + self.logger.info("Enhancing took %ss ", str(time.time() - t1)) - - - - - print("enhancing: " + str(time.time() - t1)) text_regions_p_1 = self.get_regions_from_xy_2models(img_res, is_image_enhanced) K.clear_session() gc.collect() - print("textregion: " + str(time.time() - t1)) + print("Textregion detection took %ss " + str(time.time() - t1)) img_g = cv2.imread(self.image_filename, 0) img_g = img_g.astype(np.uint8) From 3b10128c8f2f46015bc3094377ceb81508f635d9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Feb 2021 18:28:27 +0100 Subject: [PATCH 27/89] simplify if-else logic for enhancement --- sbb_newspapers_org_image/eynollah.py | 32 +++++++++++++--------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 365012f..37f2d57 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -2136,23 +2136,21 @@ class eynollah: K.clear_session() scale = 1 - 
if self.allow_enhancement and is_image_enhanced: - cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif", img_res) - img_res = img_res.astype(np.uint8) - self.get_image_and_scales(img_org, img_res, scale) - - if not self.allow_enhancement and is_image_enhanced: - self.get_image_and_scales_after_enhancing(img_org, img_res) - - if self.allow_enhancement and not is_image_enhanced: - self.get_image_and_scales(img_org, img_res, scale) - - if not self.allow_enhancement and not is_image_enhanced: - self.get_image_and_scales(img_org, img_res, scale) - - if self.allow_scaling and not is_image_enhanced: - img_org, img_res, is_image_enhanced = self.resize_image_with_column_classifier(is_image_enhanced) - self.get_image_and_scales_after_enhancing(img_org, img_res) + if is_image_enhanced: + if self.allow_enhancement: + cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif", img_res) + img_res = img_res.astype(np.uint8) + self.get_image_and_scales(img_org, img_res, scale) + else: + self.get_image_and_scales_after_enhancing(img_org, img_res) + else: + if self.allow_enhancement: + self.get_image_and_scales(img_org, img_res, scale) + else: + self.get_image_and_scales(img_org, img_res, scale) + if self.allow_scaling: + img_org, img_res, is_image_enhanced = self.resize_image_with_column_classifier(is_image_enhanced) + self.get_image_and_scales_after_enhancing(img_org, img_res) self.logger.info("Enhancing took %ss ", str(time.time() - t1)) text_regions_p_1 = self.get_regions_from_xy_2models(img_res, is_image_enhanced) From 2929ba1ee355327fbdb1c993e3970c9649b323a3 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Feb 2021 18:33:06 +0100 Subject: [PATCH 28/89] imread: use cv2.IMREAD_GRAYSCALE constant rather than 0 --- sbb_newspapers_org_image/eynollah.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 37f2d57..19c0fce
100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -347,7 +347,7 @@ class eynollah: _, page_coord = self.early_page_for_num_of_column_classification() model_num_classifier, session_col_classifier = self.start_new_session_and_model(self.model_dir_of_col_classifier) - img_1ch = cv2.imread(self.image_filename, 0) + img_1ch = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) width_early = img_1ch.shape[1] img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] @@ -394,7 +394,7 @@ class eynollah: _, page_coord = self.early_page_for_num_of_column_classification() model_num_classifier, session_col_classifier = self.start_new_session_and_model(self.model_dir_of_col_classifier) - img_1ch = cv2.imread(self.image_filename, 0) + img_1ch = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) img_1ch = img_1ch.astype(np.uint8) width_early = img_1ch.shape[1] @@ -1673,6 +1673,7 @@ class eynollah: # cv2.imwrite(os.path.join(dir_of_image, self.image_filename_stem) + ".tif",self.image_org) def get_regions_from_xy_2models(self,img,is_image_enhanced): + self.logger.debug("enter get_regions_from_xy_2models") img_org = np.copy(img) img_height_h = img_org.shape[0] img_width_h = img_org.shape[1] @@ -2132,7 +2133,7 @@ class eynollah: ###is_image_enhanced,img_org,img_res=self.resize_and_enhance_image(is_image_enhanced) self.logger.info("resize and enhance image") is_image_enhanced, img_org, img_res, num_col_classifier, num_column_is_classified = self.resize_and_enhance_image_with_column_classifier(is_image_enhanced) - self.logger.info("Image is %senhanced" % 'is ' if is_image_enhanced else '') + self.logger.info("Image is %senhanced", '' if is_image_enhanced else 'not ') K.clear_session() scale = 1 @@ -2156,10 +2157,9 @@ class eynollah: text_regions_p_1 = self.get_regions_from_xy_2models(img_res, is_image_enhanced) K.clear_session() gc.collect() + self.logger.info("Textregion detection took %ss " + str(time.time() -
t1)) - print("Textregion detection took %ss " + str(time.time() - t1)) - - img_g = cv2.imread(self.image_filename, 0) + img_g = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) img_g = img_g.astype(np.uint8) img_g3 = np.zeros((img_g.shape[0], img_g.shape[1], 3)) From 8cd4067fc56518734a8e46497a22e06b662fffb9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Feb 2021 18:46:03 +0100 Subject: [PATCH 29/89] cli: add negative flag variants (--no/-no) --- sbb_newspapers_org_image/cli.py | 24 ++++++++++++------------ sbb_newspapers_org_image/eynollah.py | 7 +++++-- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/sbb_newspapers_org_image/cli.py b/sbb_newspapers_org_image/cli.py index 369afc8..75e148e 100644 --- a/sbb_newspapers_org_image/cli.py +++ b/sbb_newspapers_org_image/cli.py @@ -49,38 +49,38 @@ from sbb_newspapers_org_image.eynollah import eynollah type=click.Path(exists=True, file_okay=False), ) @click.option( - "--enable_plotting", - "-ep", + "--enable-plotting/--disable-plotting", + "-ep/-noep", is_flag=True, help="If set, will plot intermediary files and images", ) @click.option( - "--allow_enhancement", - "-ae", + "--allow-enhancement/--no-allow-enhancement", + "-ae/-noae", is_flag=True, help="if this parameter set to true, this tool would check that input image need resizing and enhancement or not. If so output of resized and enhanced image and corresponding layout data will be written in out directory", ) @click.option( - "--curved_line", - "-cl", + "--curved-line/--no-curvedline", + "-cl/-nocl", is_flag=True, help="if this parameter set to true, this tool will try to return contoure of textlines instead of rectabgle bounding box of textline. 
This should be taken into account that with this option the tool need more time to do process.", ) @click.option( - "--full_layout", - "-fl", + "--full-layout/--no-full-layout", + "-fl/-nofl", is_flag=True, help="if this parameter set to true, this tool will try to return all elements of layout.", ) @click.option( - "--allow_scaling", - "-as", + "--allow_scaling/--no-allow-scaling", + "-as/-noas", is_flag=True, help="if this parameter set to true, this tool would check the scale and if needed it will scale it to perform better layout detection", ) @click.option( - "--headers_off", - "-ho", + "--headers-off/--headers-on", + "-ho/-noho", is_flag=True, help="if this parameter set to true, this tool would ignore headers role in reading order", ) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 19c0fce..7a90d68 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -176,6 +176,7 @@ class eynollah: self.model_textline_dir = dir_models + "/model_textline_newspapers.h5" #'/model_hor_ver_home_trextline_very_good.h5'# '/model_hor_ver_1_great.h5'#'/model_curved_office_works_great.h5' def predict_enhancement(self, img): + self.logger.debug("enter predict_enhancement") model_enhancement, session_enhancemnet = self.start_new_session_and_model(self.model_dir_of_enhancemnet) img_height_model = model_enhancement.layers[len(model_enhancement.layers) - 1].output_shape[1] @@ -279,6 +280,7 @@ class eynollah: return int(float(dpi)) def calculate_width_height_by_columns(self, img, num_col, width_early, label_p_pred): + self.logger.debug("enter calculate_width_height_by_columns") if num_col == 1 and width_early < 1100: img_w_new = 2000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2000) @@ -341,6 +343,7 @@ class eynollah: return img_new, num_column_is_classified def resize_image_with_column_classifier(self, is_image_enhanced): + self.logger.debug("enter resize_image_with_column_classifier") img = 
cv2.imread(self.image_filename) img = img.astype(np.uint8) @@ -444,6 +447,7 @@ class eynollah: return is_image_enhanced, img, image_res, num_col, num_column_is_classified def get_image_and_scales(self, img_org, img_res, scale): + self.logger.debug("enter get_image_and_scales") self.image = np.copy(img_res) self.image_org = np.copy(img_org) self.height_org = self.image.shape[0] @@ -462,11 +466,10 @@ class eynollah: #self.plotter.scale_y = self.scale_y #self.plotter.scale_x = self.scale_x - def get_image_and_scales_after_enhancing(self, img_org, img_res): - + self.logger.debug("enter get_image_and_scales_after_enhancing") self.image = np.copy(img_res) self.image = self.image.astype(np.uint8) self.image_org = np.copy(img_org) From ca23b32e9bf41da6ff877233c15b7d6a0dd449ba Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Feb 2021 19:55:54 +0100 Subject: [PATCH 30/89] split do_order_of_regions, lots of logging --- sbb_newspapers_org_image/eynollah.py | 570 ++++++++++++++------------- 1 file changed, 295 insertions(+), 275 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 7a90d68..93d013a 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -276,6 +276,7 @@ class eynollah: return prediction_true def check_dpi(self): + self.logger.debug("enter check_dpi") dpi = os.popen('identify -format "%x " ' + self.image_filename).read() return int(float(dpi)) @@ -368,7 +369,7 @@ class eynollah: label_p_pred = model_num_classifier.predict(img_in) num_col = np.argmax(label_p_pred[0]) + 1 - print(num_col, label_p_pred, "num_col_classifier") + self.logger.info("Found %s columns (%s)", num_col, label_p_pred) session_col_classifier.close() del model_num_classifier @@ -421,7 +422,7 @@ class eynollah: label_p_pred = model_num_classifier.predict(img_in) num_col = np.argmax(label_p_pred[0]) + 1 - print(num_col, label_p_pred, "num_col_classifier") + self.logger.info("Found %s 
columns (%s)", num_col, label_p_pred) session_col_classifier.close() del model_num_classifier @@ -431,7 +432,7 @@ class eynollah: del page_coord K.clear_session() gc.collect() - print(dpi) + self.logger.info("%s DPI" % dpi) if dpi < 298: img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) @@ -484,7 +485,7 @@ class eynollah: del img_res def start_new_session_and_model(self, model_dir): - self.logger.debug("enter start_new_session_and_model") + self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir) config = tf.ConfigProto() config.gpu_options.allow_growth = True @@ -507,7 +508,7 @@ class eynollah: if img.shape[1] < img_width_model: img = resize_image(img, img.shape[0], img_width_model) - # print(img_height_model,img_width_model) + self.logger.info("Image dimensions: %sx%s", img_height_model, img_width_model) margin = int(marginal_of_patch_percent * img_height_model) width_mid = img_width_model - 2 * margin height_mid = img_height_model - 2 * margin @@ -660,9 +661,11 @@ class eynollah: del img_page_prediction gc.collect() + self.logger.debug("exit resize_and_enhance_image_with_column_classifier") return croped_page, page_coord def extract_page(self): + self.logger.debug("enter extract_page") patches = False model_page, session_page = self.start_new_session_and_model(self.model_page_dir) for ii in range(1): @@ -708,6 +711,7 @@ class eynollah: return croped_page, page_coord def extract_text_regions(self, img, patches, cols): + self.logger.debug("enter extract_text_regions") img_height_h = img.shape[0] img_width_h = img.shape[1] @@ -809,9 +813,11 @@ class eynollah: del session_region del img gc.collect() + self.logger.debug("exit extract_text_regions") return prediction_regions, prediction_regions2 def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew): + self.logger.debug("enter get_slopes_and_deskew_new") num_cores = 
cpu_count() queue_of_all_params = Queue() @@ -858,10 +864,12 @@ class eynollah: for i in range(num_cores): processes[i].join() - # print(slopes,'slopes') + self.logger.debug('slopes %s', slopes) + self.logger.debug("exit get_slopes_and_deskew_new") return slopes, all_found_texline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con def get_slopes_and_deskew_new_curved(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, mask_texts_only, num_col, scale_par, slope_deskew): + self.logger.debug("enter get_slopes_and_deskew_new_curved") num_cores = cpu_count() queue_of_all_params = Queue() @@ -912,6 +920,7 @@ class eynollah: return all_found_texline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con, slopes def do_work_of_slopes_new_curved(self, queue_of_all_params, boxes_text, textline_mask_tot_ea, contours_per_process, contours_par_per_process, image_page_rotated, mask_texts_only, num_col, scale_par, indexes_r_con_per_pro, slope_deskew): + self.logger.debug("enter do_work_of_slopes_new_curved") slopes_per_each_subprocess = [] bounding_box_of_textregion_per_each_subprocess = [] textlines_rectangles_per_each_subprocess = [] @@ -1021,6 +1030,7 @@ class eynollah: queue_of_all_params.put([textlines_rectangles_per_each_subprocess, bounding_box_of_textregion_per_each_subprocess, contours_textregion_per_each_subprocess, contours_textregion_par_per_each_subprocess, all_box_coord_per_process, index_by_text_region_contours, slopes_per_each_subprocess]) def do_work_of_slopes_new(self, queue_of_all_params, boxes_text, textline_mask_tot_ea, contours_per_process, contours_par_per_process, indexes_r_con_per_pro, image_page_rotated, slope_deskew): + self.logger.debug('enter do_work_of_slopes_new') slopes_per_each_subprocess = [] bounding_box_of_textregion_per_each_subprocess = [] @@ -1095,6 +1105,7 @@ class eynollah: 
queue_of_all_params.put([slopes_per_each_subprocess, textlines_rectangles_per_each_subprocess, bounding_box_of_textregion_per_each_subprocess, contours_textregion_per_each_subprocess, contours_textregion_par_per_each_subprocess, all_box_coord_per_process, index_by_text_region_contours]) def textline_contours(self, img, patches, scaler_h, scaler_w): + self.logger.debug('enter textline_contours') if patches: model_textline, session_textline = self.start_new_session_and_model(self.model_textline_dir) @@ -1127,6 +1138,7 @@ class eynollah: return prediction_textline[:, :, 0], prediction_textline_longshot_true_size[:, :, 0] def do_work_of_slopes(self, q, poly, box_sub, boxes_per_process, textline_mask_tot, contours_per_process): + self.logger.debug('enter do_work_of_slopes') slope_biggest = 0 slopes_sub = [] boxes_sub_new = [] @@ -1167,6 +1179,7 @@ class eynollah: box_sub.put(boxes_sub_new) def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l): + self.logger.debug('enter serialize_lines_in_region') for j in range(len(all_found_texline_polygons[region_idx])): textline=ET.SubElement(textregion, 'TextLine') textline.set('id','l'+str(id_indexer_l)) @@ -1245,6 +1258,7 @@ class eynollah: return id_indexer_l def calculate_polygon_coords(self, contour_list, i, page_coord): + self.logger.debug('enter calculate_polygon_coords') coords = '' for j in range(len(contour_list[i])): if len(contour_list[i][j]) == 2: @@ -1262,6 +1276,7 @@ class eynollah: return coords def write_into_page_xml_full(self, contours, contours_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): + self.logger.debug('enter 
write_into_page_xml_full') found_polygons_text_region = contours found_polygons_text_region_h = contours_h @@ -1481,13 +1496,14 @@ class eynollah: ##tree = ET.ElementTree(pcgts) ##tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") - print(self.image_filename_stem) + self.logger.info("filename stem: '%s'", self.image_filename_stem) # print(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") tree = ET.ElementTree(pcgts) tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") def calculate_page_coords(self): + self.logger.debug('enter calculate_page_coords') points_page_print = "" for lmm in range(len(self.cont_page[0])): if len(self.cont_page[0][lmm]) == 2: @@ -1504,6 +1520,7 @@ class eynollah: return points_page_print def write_into_page_xml(self, contours, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): + self.logger.debug('enter write_into_page_xml') found_polygons_text_region = contours ##found_polygons_text_region_h=contours_h @@ -1669,11 +1686,9 @@ class eynollah: pass - print(self.image_filename_stem) - # print(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") + self.logger.info("filename stem: '%s'", self.image_filename_stem) tree = ET.ElementTree(pcgts) tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") - # cv2.imwrite(os.path.join(dir_of_image, self.image_filename_stem) + ".tif",self.image_org) def get_regions_from_xy_2models(self,img,is_image_enhanced): self.logger.debug("enter get_regions_from_xy_2models") @@ -1792,7 +1807,7 @@ class eynollah: rate_two_models=text_sume_second/float(text_sume_early)*100 - print(rate_two_models,'ratio_of_two_models') + self.logger.info("ratio_of_two_models: %s", rate_two_models) if is_image_enhanced and 
rate_two_models<95.50:#98.45: pass else: @@ -1843,292 +1858,299 @@ class eynollah: return text_regions_p_true - def do_order_of_regions(self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): + def do_order_of_regions_full_layout(self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): + self.logger.debug("enter do_order_of_regions_full_layout") + cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contoures(contours_only_text_parent) + cx_text_only_h, cy_text_only_h, x_min_text_only_h, _, _, _, y_cor_x_min_main_h = find_new_features_of_contoures(contours_only_text_parent_h) - if self.full_layout: - cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contoures(contours_only_text_parent) - cx_text_only_h, cy_text_only_h, x_min_text_only_h, _, _, _, y_cor_x_min_main_h = find_new_features_of_contoures(contours_only_text_parent_h) + try: + arg_text_con = [] + for ii in range(len(cx_text_only)): + for jj in range(len(boxes)): + if (x_min_text_only[ii] + 80) >= boxes[jj][0] and (x_min_text_only[ii] + 80) < boxes[jj][1] and y_cor_x_min_main[ii] >= boxes[jj][2] and y_cor_x_min_main[ii] < boxes[jj][3]: + arg_text_con.append(jj) + break + arg_arg_text_con = np.argsort(arg_text_con) + args_contours = np.array(range(len(arg_text_con))) + + arg_text_con_h = [] + for ii in range(len(cx_text_only_h)): + for jj in range(len(boxes)): + if (x_min_text_only_h[ii] + 80) >= boxes[jj][0] and (x_min_text_only_h[ii] + 80) < boxes[jj][1] and y_cor_x_min_main_h[ii] >= boxes[jj][2] and y_cor_x_min_main_h[ii] < boxes[jj][3]: + arg_text_con_h.append(jj) + break + arg_arg_text_con = np.argsort(arg_text_con_h) + args_contours_h = np.array(range(len(arg_text_con_h))) + + order_by_con_head = np.zeros(len(arg_text_con_h)) + order_by_con_main = np.zeros(len(arg_text_con)) + + ref_point = 0 + order_of_texts_tot = [] + id_of_texts_tot = [] + for iij in 
range(len(boxes)): - try: - arg_text_con = [] - for ii in range(len(cx_text_only)): - for jj in range(len(boxes)): - if (x_min_text_only[ii] + 80) >= boxes[jj][0] and (x_min_text_only[ii] + 80) < boxes[jj][1] and y_cor_x_min_main[ii] >= boxes[jj][2] and y_cor_x_min_main[ii] < boxes[jj][3]: - arg_text_con.append(jj) - break - arg_arg_text_con = np.argsort(arg_text_con) - args_contours = np.array(range(len(arg_text_con))) - - arg_text_con_h = [] - for ii in range(len(cx_text_only_h)): - for jj in range(len(boxes)): - if (x_min_text_only_h[ii] + 80) >= boxes[jj][0] and (x_min_text_only_h[ii] + 80) < boxes[jj][1] and y_cor_x_min_main_h[ii] >= boxes[jj][2] and y_cor_x_min_main_h[ii] < boxes[jj][3]: - arg_text_con_h.append(jj) - break - arg_arg_text_con = np.argsort(arg_text_con_h) - args_contours_h = np.array(range(len(arg_text_con_h))) - - order_by_con_head = np.zeros(len(arg_text_con_h)) - order_by_con_main = np.zeros(len(arg_text_con)) - - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] - for iij in range(len(boxes)): - - args_contours_box = args_contours[np.array(arg_text_con) == iij] - args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = [] - con_inter_box_h = [] - - for i in range(len(args_contours_box)): - con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) - - for i in range(len(args_contours_box_h)): - con_inter_box_h.append(contours_only_text_parent_h[args_contours_box_h[i]]) - - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) - - order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - 
indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] - indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] - - zahler = 0 - for mtv in args_contours_box: - arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point - zahler = zahler + 1 - - zahler = 0 - for mtv in args_contours_box_h: - arg_order_v = indexes_sorted_head[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - # print(indexes_sorted,np.where(indexes_sorted==arg_order_v ),arg_order_v,tartib,'inshgalla') - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = tartib + ref_point - zahler = zahler + 1 - - for jji in range(len(id_of_texts)): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) - - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - for tj1 in range(len(contours_only_text_parent_h)): - order_of_texts_tot.append(int(order_by_con_head[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] - order_text_new.append(tartib_new) + args_contours_box = args_contours[np.array(arg_text_con) == iij] + args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] + con_inter_box = [] + con_inter_box_h = [] - except: - arg_text_con = [] - for ii in range(len(cx_text_only)): - for jj in range(len(boxes)): - if cx_text_only[ii] >= boxes[jj][0] and cx_text_only[ii] < boxes[jj][1] and cy_text_only[ii] >= boxes[jj][2] and cy_text_only[ii] < boxes[jj][3]: # this is valid if the center of region identify in which box it is 
located - arg_text_con.append(jj) - break - arg_arg_text_con = np.argsort(arg_text_con) - args_contours = np.array(range(len(arg_text_con))) - - order_by_con_main = np.zeros(len(arg_text_con)) - - ############################# head - - arg_text_con_h = [] - for ii in range(len(cx_text_only_h)): - for jj in range(len(boxes)): - if cx_text_only_h[ii] >= boxes[jj][0] and cx_text_only_h[ii] < boxes[jj][1] and cy_text_only_h[ii] >= boxes[jj][2] and cy_text_only_h[ii] < boxes[jj][3]: # this is valid if the center of region identify in which box it is located - arg_text_con_h.append(jj) - break - arg_arg_text_con_h = np.argsort(arg_text_con_h) - args_contours_h = np.array(range(len(arg_text_con_h))) - - order_by_con_head = np.zeros(len(arg_text_con_h)) - - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] - for iij in range(len(boxes)): - args_contours_box = args_contours[np.array(arg_text_con) == iij] - args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] - con_inter_box = [] - con_inter_box_h = [] - - for i in range(len(args_contours_box)): - - con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) - for i in range(len(args_contours_box_h)): - - con_inter_box_h.append(contours_only_text_parent_h[args_contours_box_h[i]]) - - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) - - order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] - indexes_by_type_head = 
np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] - - zahler = 0 - for mtv in args_contours_box: - arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point - zahler = zahler + 1 - - zahler = 0 - for mtv in args_contours_box_h: - arg_order_v = indexes_sorted_head[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - # print(indexes_sorted,np.where(indexes_sorted==arg_order_v ),arg_order_v,tartib,'inshgalla') - order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = tartib + ref_point - zahler = zahler + 1 - - for jji in range(len(id_of_texts)): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) - - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) - - for tj1 in range(len(contours_only_text_parent_h)): - order_of_texts_tot.append(int(order_by_con_head[tj1])) - - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] - order_text_new.append(tartib_new) - return order_text_new, id_of_texts_tot + for i in range(len(args_contours_box)): + con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) - else: - cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contoures(contours_only_text_parent) + for i in range(len(args_contours_box_h)): + con_inter_box_h.append(contours_only_text_parent_h[args_contours_box_h[i]]) - try: - arg_text_con = [] - for ii in range(len(cx_text_only)): - for jj in range(len(boxes)): - if (x_min_text_only[ii] + 80) >= boxes[jj][0] and (x_min_text_only[ii] + 80) < boxes[jj][1] and y_cor_x_min_main[ii] >= boxes[jj][2] and y_cor_x_min_main[ii] < boxes[jj][3]: - 
arg_text_con.append(jj) - break - arg_arg_text_con = np.argsort(arg_text_con) - args_contours = np.array(range(len(arg_text_con))) + indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) - order_by_con_main = np.zeros(len(arg_text_con)) + order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] - for iij in range(len(boxes)): + indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] + indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] - args_contours_box = args_contours[np.array(arg_text_con) == iij] + zahler = 0 + for mtv in args_contours_box: + arg_order_v = indexes_sorted_main[zahler] + tartib = np.where(indexes_sorted == arg_order_v)[0][0] + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + zahler = zahler + 1 - con_inter_box = [] - con_inter_box_h = [] + zahler = 0 + for mtv in args_contours_box_h: + arg_order_v = indexes_sorted_head[zahler] + tartib = np.where(indexes_sorted == arg_order_v)[0][0] + # print(indexes_sorted,np.where(indexes_sorted==arg_order_v ),arg_order_v,tartib,'inshgalla') + order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = tartib + ref_point + zahler = zahler + 1 - for i in range(len(args_contours_box)): - con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) + for jji in range(len(id_of_texts)): + order_of_texts_tot.append(order_of_texts[jji] + ref_point) + 
id_of_texts_tot.append(id_of_texts[jji]) + ref_point = ref_point + len(id_of_texts) - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) + order_of_texts_tot = [] + for tj1 in range(len(contours_only_text_parent)): + order_of_texts_tot.append(int(order_by_con_main[tj1])) - order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + for tj1 in range(len(contours_only_text_parent_h)): + order_of_texts_tot.append(int(order_by_con_head[tj1])) - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] - indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] + order_text_new = [] + for iii in range(len(order_of_texts_tot)): + tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] + order_text_new.append(tartib_new) - zahler = 0 - for mtv in args_contours_box: - arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point - zahler = zahler + 1 + except: + arg_text_con = [] + for ii in range(len(cx_text_only)): + for jj in range(len(boxes)): + if cx_text_only[ii] >= boxes[jj][0] and cx_text_only[ii] < boxes[jj][1] and cy_text_only[ii] >= boxes[jj][2] and cy_text_only[ii] < boxes[jj][3]: # this is valid if the center of region identify in which box it is located + arg_text_con.append(jj) + break + arg_arg_text_con = np.argsort(arg_text_con) + args_contours = np.array(range(len(arg_text_con))) + + 
order_by_con_main = np.zeros(len(arg_text_con)) + + ############################# head + + arg_text_con_h = [] + for ii in range(len(cx_text_only_h)): + for jj in range(len(boxes)): + if cx_text_only_h[ii] >= boxes[jj][0] and cx_text_only_h[ii] < boxes[jj][1] and cy_text_only_h[ii] >= boxes[jj][2] and cy_text_only_h[ii] < boxes[jj][3]: # this is valid if the center of region identify in which box it is located + arg_text_con_h.append(jj) + break + arg_arg_text_con_h = np.argsort(arg_text_con_h) + args_contours_h = np.array(range(len(arg_text_con_h))) + + order_by_con_head = np.zeros(len(arg_text_con_h)) + + ref_point = 0 + order_of_texts_tot = [] + id_of_texts_tot = [] + for iij in range(len(boxes)): + args_contours_box = args_contours[np.array(arg_text_con) == iij] + args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] + con_inter_box = [] + con_inter_box_h = [] - for jji in range(len(id_of_texts)): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) + for i in range(len(args_contours_box)): - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) + con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) + for i in range(len(args_contours_box_h)): - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] - order_text_new.append(tartib_new) + con_inter_box_h.append(contours_only_text_parent_h[args_contours_box_h[i]]) - except: - arg_text_con = [] - for ii in range(len(cx_text_only)): - for jj in range(len(boxes)): - if cx_text_only[ii] >= boxes[jj][0] and cx_text_only[ii] < boxes[jj][1] and cy_text_only[ii] >= boxes[jj][2] and cy_text_only[ii] < boxes[jj][3]: # this is valid if the center of region identify in which box it is located - arg_text_con.append(jj) - break - arg_arg_text_con = 
np.argsort(arg_text_con) - args_contours = np.array(range(len(arg_text_con))) + indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) + + order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + + indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] + indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] + + zahler = 0 + for mtv in args_contours_box: + arg_order_v = indexes_sorted_main[zahler] + tartib = np.where(indexes_sorted == arg_order_v)[0][0] + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + zahler = zahler + 1 - order_by_con_main = np.zeros(len(arg_text_con)) + zahler = 0 + for mtv in args_contours_box_h: + arg_order_v = indexes_sorted_head[zahler] + tartib = np.where(indexes_sorted == arg_order_v)[0][0] + # print(indexes_sorted,np.where(indexes_sorted==arg_order_v ),arg_order_v,tartib,'inshgalla') + order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = tartib + ref_point + zahler = zahler + 1 - ref_point = 0 - order_of_texts_tot = [] - id_of_texts_tot = [] - for iij in range(len(boxes)): - args_contours_box = args_contours[np.array(arg_text_con) == iij] - con_inter_box = [] - con_inter_box_h = [] + for jji in range(len(id_of_texts)): + order_of_texts_tot.append(order_of_texts[jji] + ref_point) + id_of_texts_tot.append(id_of_texts[jji]) + ref_point = ref_point + len(id_of_texts) - for i in range(len(args_contours_box)): + order_of_texts_tot = [] + for tj1 
in range(len(contours_only_text_parent)): + order_of_texts_tot.append(int(order_by_con_main[tj1])) - con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) + for tj1 in range(len(contours_only_text_parent_h)): + order_of_texts_tot.append(int(order_by_con_head[tj1])) - indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) + order_text_new = [] + for iii in range(len(order_of_texts_tot)): + tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] + order_text_new.append(tartib_new) + return order_text_new, id_of_texts_tot + + def do_order_of_regions_no_full_layout(self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): + self.logger.debug("enter do_order_of_regions_no_full_layout") + cx_text_only, cy_text_only, x_min_text_only, _, _, _, y_cor_x_min_main = find_new_features_of_contoures(contours_only_text_parent) + + try: + arg_text_con = [] + for ii in range(len(cx_text_only)): + for jj in range(len(boxes)): + if (x_min_text_only[ii] + 80) >= boxes[jj][0] and (x_min_text_only[ii] + 80) < boxes[jj][1] and y_cor_x_min_main[ii] >= boxes[jj][2] and y_cor_x_min_main[ii] < boxes[jj][3]: + arg_text_con.append(jj) + break + arg_arg_text_con = np.argsort(arg_text_con) + args_contours = np.array(range(len(arg_text_con))) + + order_by_con_main = np.zeros(len(arg_text_con)) + + ref_point = 0 + order_of_texts_tot = [] + id_of_texts_tot = [] + for iij in range(len(boxes)): - order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + args_contours_box = args_contours[np.array(arg_text_con) == iij] - indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_by_type_main = 
np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] - indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] + con_inter_box = [] + con_inter_box_h = [] - zahler = 0 - for mtv in args_contours_box: - arg_order_v = indexes_sorted_main[zahler] - tartib = np.where(indexes_sorted == arg_order_v)[0][0] - order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point - zahler = zahler + 1 + for i in range(len(args_contours_box)): + con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) - for jji in range(len(id_of_texts)): - order_of_texts_tot.append(order_of_texts[jji] + ref_point) - id_of_texts_tot.append(id_of_texts[jji]) - ref_point = ref_point + len(id_of_texts) + indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) - order_of_texts_tot = [] - for tj1 in range(len(contours_only_text_parent)): - order_of_texts_tot.append(int(order_by_con_main[tj1])) + order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) - order_text_new = [] - for iii in range(len(order_of_texts_tot)): - tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] - order_text_new.append(tartib_new) + indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] + indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] - return order_text_new, id_of_texts_tot + zahler = 0 + for mtv in 
args_contours_box: + arg_order_v = indexes_sorted_main[zahler] + tartib = np.where(indexes_sorted == arg_order_v)[0][0] + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + zahler = zahler + 1 + + for jji in range(len(id_of_texts)): + order_of_texts_tot.append(order_of_texts[jji] + ref_point) + id_of_texts_tot.append(id_of_texts[jji]) + ref_point = ref_point + len(id_of_texts) + + order_of_texts_tot = [] + for tj1 in range(len(contours_only_text_parent)): + order_of_texts_tot.append(int(order_by_con_main[tj1])) + + order_text_new = [] + for iii in range(len(order_of_texts_tot)): + tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] + order_text_new.append(tartib_new) + + except: + arg_text_con = [] + for ii in range(len(cx_text_only)): + for jj in range(len(boxes)): + if cx_text_only[ii] >= boxes[jj][0] and cx_text_only[ii] < boxes[jj][1] and cy_text_only[ii] >= boxes[jj][2] and cy_text_only[ii] < boxes[jj][3]: # this is valid if the center of region identify in which box it is located + arg_text_con.append(jj) + break + arg_arg_text_con = np.argsort(arg_text_con) + args_contours = np.array(range(len(arg_text_con))) + + order_by_con_main = np.zeros(len(arg_text_con)) + + ref_point = 0 + order_of_texts_tot = [] + id_of_texts_tot = [] + for iij in range(len(boxes)): + args_contours_box = args_contours[np.array(arg_text_con) == iij] + con_inter_box = [] + con_inter_box_h = [] + + for i in range(len(args_contours_box)): + + con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) + + indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) + + order_of_texts, id_of_texts = order_and_id_of_texts(con_inter_box, con_inter_box_h, matrix_of_orders, indexes_sorted, index_by_kind_sorted, kind_of_texts_sorted, ref_point) + + 
indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] + indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] + indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] + + zahler = 0 + for mtv in args_contours_box: + arg_order_v = indexes_sorted_main[zahler] + tartib = np.where(indexes_sorted == arg_order_v)[0][0] + order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point + zahler = zahler + 1 + + for jji in range(len(id_of_texts)): + order_of_texts_tot.append(order_of_texts[jji] + ref_point) + id_of_texts_tot.append(id_of_texts[jji]) + ref_point = ref_point + len(id_of_texts) + + order_of_texts_tot = [] + for tj1 in range(len(contours_only_text_parent)): + order_of_texts_tot.append(int(order_by_con_main[tj1])) + + order_text_new = [] + for iii in range(len(order_of_texts_tot)): + tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] + order_text_new.append(tartib_new) + + return order_text_new, id_of_texts_tot + + def do_order_of_regions(self, *args, **kwargs): + if self.full_layout: + return self.do_order_of_regions_full_layout(*args, **kwargs) + return self.do_order_of_regions_no_full_layout(*args, **kwargs) def run(self): + """ + Get image and scales, then extract the page of scanned image + """ self.logger.debug("enter run") is_image_enhanced = False - # get image and sclaes, then extract the page of scanned image t1 = time.time() ########## @@ -2230,7 +2252,7 @@ class eynollah: #print(np.unique(textline_mask_tot_ea[:, :]), "textline") if self.plotter: self.plotter.save_plot_of_textlines(textline_mask_tot_ea, image_page) - print("textline: " + str(time.time() - t1)) + self.logger.info("textline detection took %ss", str(time.time() - t1)) # plt.imshow(textline_mask_tot_ea) # plt.show() # sys.exit() @@ -2243,12 +2265,12 @@ class 
eynollah: if self.plotter: self.plotter.save_deskewed_image(slope_deskew) # img_rotated=rotyate_image_different(self.image_org,slope_deskew) - print(slope_deskew, "slope_deskew") + self.logger.info("slope_deskew: %s", slope_deskew) ##plt.imshow(img_rotated) ##plt.show() ##sys.exit() - print("deskewing: " + str(time.time() - t1)) + self.logger.info("deskewing: " + str(time.time() - t1)) image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] textline_mask_tot[mask_images[:, :] == 1] = 0 @@ -2278,7 +2300,7 @@ class eynollah: self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) self.plotter.save_plot_of_layout_main(text_regions_p, image_page) - print("marginals: " + str(time.time() - t1)) + self.logger.info("detection of marginals took %ss", str(time.time() - t1)) if not self.full_layout: @@ -2298,8 +2320,7 @@ class eynollah: K.clear_session() gc.collect() - # print(peaks_neg_fin,num_col,'num_col2') - print(num_col_classifier, "num_col_classifier") + self.logger.info("num_col_classifier: %s", num_col_classifier) if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -2323,9 +2344,8 @@ class eynollah: else: boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) - # print(len(boxes),'boxes') - # sys.exit() - print("boxes in: " + str(time.time() - t1)) + self.logger.debug("len(boxes): %s", len(boxes)) + self.logger.info("detecting boxes took %ss", str(time.time() - t1)) img_revised_tab = text_regions_p[:, :] pixel_img = 2 polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) @@ -2412,7 +2432,7 @@ class eynollah: K.clear_session() gc.collect() img_revised_tab = np.copy(text_regions_p[:, :]) - print("full layout in: " + str(time.time() - t1)) + self.logger.info("detection of full layout took %ss", str(time.time() - t1)) pixel_img = 5 polygons_of_images = 
return_contours_of_interested_region(img_revised_tab, pixel_img) @@ -2638,7 +2658,7 @@ class eynollah: self.write_into_page_xml_full(contours_only_text_parent, contours_only_text_parent_h, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals) else: contours_only_text_parent_h = None - # print('bura galmir?') + # self.logger.debug('bura galmir?') if np.abs(slope_deskew) < SLOPE_THRESHOLD: #contours_only_text_parent = list(np.array(contours_only_text_parent)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) @@ -2648,4 +2668,4 @@ class eynollah: # order_text_new , id_of_texts_tot=self.do_order_of_regions(contours_only_text_parent,contours_only_text_parent_h,boxes,textline_mask_tot) self.write_into_page_xml(txt_con_org, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals) - print("Job done in: " + str(time.time() - t1)) + self.logger.info("Job done in %ss", str(time.time() - t1)) From 70d0b985a9693542fbecab58c5c9d6f0a8a3f34b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Feb 2021 19:59:28 +0100 Subject: [PATCH 31/89] cli: add a --log-level option --- sbb_newspapers_org_image/cli.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sbb_newspapers_org_image/cli.py b/sbb_newspapers_org_image/cli.py index 75e148e..10bf5e8 100644 --- a/sbb_newspapers_org_image/cli.py +++ b/sbb_newspapers_org_image/cli.py @@ -1,5 +1,5 @@ import click -from ocrd_utils 
import initLogging +from ocrd_utils import initLogging, setOverrideLogLevel from sbb_newspapers_org_image.eynollah import eynollah @@ -84,6 +84,12 @@ from sbb_newspapers_org_image.eynollah import eynollah is_flag=True, help="if this parameter set to true, this tool would ignore headers role in reading order", ) +@click.option( + "--log-level", + "-l", + type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), + help="Override log level globally to this", +) def main( image, out, @@ -98,7 +104,10 @@ def main( full_layout, allow_scaling, headers_off, + log_level ): + if log_level: + setOverrideLogLevel(log_level) initLogging() eynollah( image, From 306acd361855570991fd5519caaea880471a9eff Mon Sep 17 00:00:00 2001 From: Clemens Neudecker <952378+cneud@users.noreply.github.com> Date: Thu, 4 Feb 2021 23:48:46 +0100 Subject: [PATCH 32/89] replace `PrintSpace` with `Border` fix https://github.com/qurator-spk/eynollah/issues/15 --- sbb_newspapers_org_image/eynollah.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 93d013a..1aad610 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1284,7 +1284,7 @@ class eynollah: # create the file structure pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) - page_print_sub = ET.SubElement(page, "PrintSpace") + page_print_sub = ET.SubElement(page, "Border") coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords()) @@ -1527,7 +1527,7 @@ class eynollah: # create the file structure pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) - page_print_sub = ET.SubElement(page, "PrintSpace") + page_print_sub = ET.SubElement(page, "Border") coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords()) From 
1dec9b87311226cf5600112b43a2ab2ecec0f585 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 5 Feb 2021 12:30:00 +0100 Subject: [PATCH 33/89] simplify if-else in run --- sbb_newspapers_org_image/eynollah.py | 700 +++++++++++++-------------- 1 file changed, 349 insertions(+), 351 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 93d013a..bb8b12d 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -2240,432 +2240,430 @@ class eynollah: slopes_marginals = [] self.write_into_page_xml(txt_con_org, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals) else: - # pass - if 1>0:#try: - patches = True - scaler_h_textline = 1 # 1.2#1.2 - scaler_w_textline = 1 # 0.9#1 - textline_mask_tot_ea, textline_mask_tot_long_shot = self.textline_contours(image_page, patches, scaler_h_textline, scaler_w_textline) + patches = True + scaler_h_textline = 1 # 1.2#1.2 + scaler_w_textline = 1 # 0.9#1 + textline_mask_tot_ea, textline_mask_tot_long_shot = self.textline_contours(image_page, patches, scaler_h_textline, scaler_w_textline) + + K.clear_session() + gc.collect() + #print(np.unique(textline_mask_tot_ea[:, :]), "textline") + if self.plotter: + self.plotter.save_plot_of_textlines(textline_mask_tot_ea, image_page) + self.logger.info("textline detection took %ss", str(time.time() - t1)) + # plt.imshow(textline_mask_tot_ea) + # plt.show() + # sys.exit() + + sigma = 2 + main_page_deskew = True + slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2), sigma, main_page_deskew, plotter=self.plotter) + slope_first = 0 # return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2),sigma, plotter=self.plotter) + + if self.plotter: + 
self.plotter.save_deskewed_image(slope_deskew) + # img_rotated=rotyate_image_different(self.image_org,slope_deskew) + self.logger.info("slope_deskew: %s", slope_deskew) + + ##plt.imshow(img_rotated) + ##plt.show() + ##sys.exit() + self.logger.info("deskewing: " + str(time.time() - t1)) + + image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] + textline_mask_tot[mask_images[:, :] == 1] = 0 + + pixel_img = 1 + min_area = 0.00001 + max_area = 0.0006 + textline_mask_tot_small_size = return_contours_of_interested_region_by_size(textline_mask_tot, pixel_img, min_area, max_area) + text_regions_p_1[mask_lines[:, :] == 1] = 3 + text_regions_p = text_regions_p_1[:, :] # long_short_region[:,:]#self.get_regions_from_2_models(image_page) + text_regions_p = np.array(text_regions_p) + + if num_col_classifier == 1 or num_col_classifier == 2: + try: + regions_without_seperators = (text_regions_p[:, :] == 1) * 1 + regions_without_seperators = regions_without_seperators.astype(np.uint8) - K.clear_session() - gc.collect() - #print(np.unique(textline_mask_tot_ea[:, :]), "textline") - if self.plotter: - self.plotter.save_plot_of_textlines(textline_mask_tot_ea, image_page) - self.logger.info("textline detection took %ss", str(time.time() - t1)) - # plt.imshow(textline_mask_tot_ea) - # plt.show() - # sys.exit() + text_regions_p = get_marginals(rotate_image(regions_without_seperators, slope_deskew), text_regions_p, num_col_classifier, slope_deskew, kernel=self.kernel) - sigma = 2 - main_page_deskew = True - slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2), sigma, main_page_deskew, plotter=self.plotter) - slope_first = 0 # return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2),sigma, plotter=self.plotter) + except: + pass - if self.plotter: - self.plotter.save_deskewed_image(slope_deskew) - # img_rotated=rotyate_image_different(self.image_org,slope_deskew) - self.logger.info("slope_deskew: 
%s", slope_deskew) + # plt.imshow(text_regions_p) + # plt.show() - ##plt.imshow(img_rotated) - ##plt.show() - ##sys.exit() - self.logger.info("deskewing: " + str(time.time() - t1)) + if self.plotter: + self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) + self.plotter.save_plot_of_layout_main(text_regions_p, image_page) - image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] - textline_mask_tot[mask_images[:, :] == 1] = 0 + self.logger.info("detection of marginals took %ss", str(time.time() - t1)) - pixel_img = 1 - min_area = 0.00001 - max_area = 0.0006 - textline_mask_tot_small_size = return_contours_of_interested_region_by_size(textline_mask_tot, pixel_img, min_area, max_area) - text_regions_p_1[mask_lines[:, :] == 1] = 3 - text_regions_p = text_regions_p_1[:, :] # long_short_region[:,:]#self.get_regions_from_2_models(image_page) - text_regions_p = np.array(text_regions_p) - - if num_col_classifier == 1 or num_col_classifier == 2: - try: - regions_without_seperators = (text_regions_p[:, :] == 1) * 1 - regions_without_seperators = regions_without_seperators.astype(np.uint8) + if not self.full_layout: - text_regions_p = get_marginals(rotate_image(regions_without_seperators, slope_deskew), text_regions_p, num_col_classifier, slope_deskew, kernel=self.kernel) + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n = rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, slope_deskew) + text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) + regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 + regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 
#self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) - except: - pass + pixel_lines = 3 + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) - # plt.imshow(text_regions_p) - # plt.show() + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + K.clear_session() + gc.collect() - if self.plotter: - self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) - self.plotter.save_plot_of_layout_main(text_regions_p, image_page) + self.logger.info("num_col_classifier: %s", num_col_classifier) - self.logger.info("detection of marginals took %ss", str(time.time() - t1)) + if num_col_classifier >= 3: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + regions_without_seperators = regions_without_seperators.astype(np.uint8) + regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) + #random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) + #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 + #random_pixels_for_image[random_pixels_for_image != 0] = 1 + #regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 2)] = 1 + else: + regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) + regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) + #random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) + #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 + 
#random_pixels_for_image[random_pixels_for_image != 0] = 1 - if not self.full_layout: + #regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 2)] = 1 - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n = rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 - regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) + else: + boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) - pixel_lines = 3 - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + self.logger.debug("len(boxes): %s", len(boxes)) + self.logger.info("detecting boxes took %ss", str(time.time() - t1)) + img_revised_tab = text_regions_p[:, :] + pixel_img = 2 + polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), 
num_col_classifier, pixel_lines) - K.clear_session() - gc.collect() - - self.logger.info("num_col_classifier: %s", num_col_classifier) - - if num_col_classifier >= 3: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - regions_without_seperators = regions_without_seperators.astype(np.uint8) - regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) - #random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) - #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - #random_pixels_for_image[random_pixels_for_image != 0] = 1 - #regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 2)] = 1 - else: - regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) - regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) - #random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) - #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - #random_pixels_for_image[random_pixels_for_image != 0] = 1 + # plt.imshow(img_revised_tab) + # plt.show() + K.clear_session() - #regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 2)] = 1 + pixel_img = 4 + min_area_mar = 0.00001 + polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) - else: - boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) + if self.full_layout: + # set first model with second model + text_regions_p[:, :][text_regions_p[:, :] == 2] = 5 + text_regions_p[:, :][text_regions_p[:, :] == 3] 
= 6 + text_regions_p[:, :][text_regions_p[:, :] == 4] = 8 - self.logger.debug("len(boxes): %s", len(boxes)) - self.logger.info("detecting boxes took %ss", str(time.time() - t1)) - img_revised_tab = text_regions_p[:, :] - pixel_img = 2 - polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) + K.clear_session() + # gc.collect() + patches = True + image_page = image_page.astype(np.uint8) - # plt.imshow(img_revised_tab) - # plt.show() - K.clear_session() + # print(type(image_page)) + regions_fully, regions_fully_only_drop = self.extract_text_regions(image_page, patches, cols=num_col_classifier) + text_regions_p[:,:][regions_fully[:,:,0]==6]=6 - pixel_img = 4 - min_area_mar = 0.00001 - polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) - - if self.full_layout: - # set first model with second model - text_regions_p[:, :][text_regions_p[:, :] == 2] = 5 - text_regions_p[:, :][text_regions_p[:, :] == 3] = 6 - text_regions_p[:, :][text_regions_p[:, :] == 4] = 8 - - K.clear_session() - # gc.collect() - patches = True - image_page = image_page.astype(np.uint8) - - # print(type(image_page)) - regions_fully, regions_fully_only_drop = self.extract_text_regions(image_page, patches, cols=num_col_classifier) - text_regions_p[:,:][regions_fully[:,:,0]==6]=6 - - regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) - regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 4] = 4 - K.clear_session() - gc.collect() - - # plt.imshow(regions_fully[:,:,0]) - # plt.show() + regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) + regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 4] = 4 + K.clear_session() + gc.collect() - regions_fully = putt_bb_of_drop_capitals_of_model_in_patches_in_layout(regions_fully) + # plt.imshow(regions_fully[:,:,0]) + # plt.show() - # plt.imshow(regions_fully[:,:,0]) - # 
plt.show() + regions_fully = putt_bb_of_drop_capitals_of_model_in_patches_in_layout(regions_fully) - K.clear_session() - gc.collect() - patches = False - regions_fully_np, _ = self.extract_text_regions(image_page, patches, cols=num_col_classifier) + # plt.imshow(regions_fully[:,:,0]) + # plt.show() - # plt.imshow(regions_fully_np[:,:,0]) - # plt.show() + K.clear_session() + gc.collect() + patches = False + regions_fully_np, _ = self.extract_text_regions(image_page, patches, cols=num_col_classifier) - if num_col_classifier > 2: - regions_fully_np[:, :, 0][regions_fully_np[:, :, 0] == 4] = 0 - else: - regions_fully_np = filter_small_drop_capitals_from_no_patch_layout(regions_fully_np, text_regions_p) + # plt.imshow(regions_fully_np[:,:,0]) + # plt.show() - # plt.imshow(regions_fully_np[:,:,0]) - # plt.show() + if num_col_classifier > 2: + regions_fully_np[:, :, 0][regions_fully_np[:, :, 0] == 4] = 0 + else: + regions_fully_np = filter_small_drop_capitals_from_no_patch_layout(regions_fully_np, text_regions_p) - K.clear_session() - gc.collect() + # plt.imshow(regions_fully_np[:,:,0]) + # plt.show() - # plt.imshow(regions_fully[:,:,0]) - # plt.show() + K.clear_session() + gc.collect() - regions_fully = boosting_headers_by_longshot_region_segmentation(regions_fully, regions_fully_np, img_only_regions) + # plt.imshow(regions_fully[:,:,0]) + # plt.show() - # plt.imshow(regions_fully[:,:,0]) - # plt.show() + regions_fully = boosting_headers_by_longshot_region_segmentation(regions_fully, regions_fully_np, img_only_regions) - text_regions_p[:, :][regions_fully[:, :, 0] == 4] = 4 - text_regions_p[:, :][regions_fully_np[:, :, 0] == 4] = 4 + # plt.imshow(regions_fully[:,:,0]) + # plt.show() - #plt.imshow(text_regions_p) - #plt.show() + text_regions_p[:, :][regions_fully[:, :, 0] == 4] = 4 + text_regions_p[:, :][regions_fully_np[:, :, 0] == 4] = 4 - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = 
rotation_not_90_func_full_layout(image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) + #plt.imshow(text_regions_p) + #plt.show() - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout(image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) - regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) + text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) + regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) + regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 - K.clear_session() - gc.collect() - img_revised_tab = np.copy(text_regions_p[:, :]) - self.logger.info("detection of full layout took %ss", str(time.time() - t1)) - pixel_img = 5 - polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) + regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) - # plt.imshow(img_revised_tab) - # plt.show() + K.clear_session() + gc.collect() + img_revised_tab = np.copy(text_regions_p[:, :]) + 
self.logger.info("detection of full layout took %ss", str(time.time() - t1)) + pixel_img = 5 + polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) - # print(img_revised_tab.shape,text_regions_p_1_n.shape) - # text_regions_p_1_n=resize_image(text_regions_p_1_n,img_revised_tab.shape[0],img_revised_tab.shape[1]) - # print(np.unique(text_regions_p_1_n),'uni') + # plt.imshow(img_revised_tab) + # plt.show() - text_only = ((img_revised_tab[:, :] == 1)) * 1 - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - text_only_d = ((text_regions_p_1_n[:, :] == 1)) * 1 - ##text_only_h=( (img_revised_tab[:,:,0]==2) )*1 + # print(img_revised_tab.shape,text_regions_p_1_n.shape) + # text_regions_p_1_n=resize_image(text_regions_p_1_n,img_revised_tab.shape[0],img_revised_tab.shape[1]) + # print(np.unique(text_regions_p_1_n),'uni') - # print(text_only.shape,text_only_d.shape) - # plt.imshow(text_only) - # plt.show() + text_only = ((img_revised_tab[:, :] == 1)) * 1 + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + text_only_d = ((text_regions_p_1_n[:, :] == 1)) * 1 + ##text_only_h=( (img_revised_tab[:,:,0]==2) )*1 - # plt.imshow(text_only_d) - # plt.show() + # print(text_only.shape,text_only_d.shape) + # plt.imshow(text_only) + # plt.show() - min_con_area = 0.000005 - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text, hir_on_text = return_contours_of_image(text_only) - contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) - areas_cnt_text = np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) - areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) - contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] - contours_only_text_parent = [contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz] > min_con_area] - areas_cnt_text_parent = [areas_cnt_text[jz] for jz in 
range(len(areas_cnt_text)) if areas_cnt_text[jz] > min_con_area] - - index_con_parents = np.argsort(areas_cnt_text_parent) - contours_only_text_parent = list(np.array(contours_only_text_parent)[index_con_parents]) - areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) - - cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest]) - cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent) - - contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) - contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) - - areas_cnt_text_d = np.array([cv2.contourArea(contours_only_text_parent_d[j]) for j in range(len(contours_only_text_parent_d))]) - areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) - - contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] - index_con_parents_d=np.argsort(areas_cnt_text_d) - contours_only_text_parent_d=list(np.array(contours_only_text_parent_d)[index_con_parents_d] ) - areas_cnt_text_d=list(np.array(areas_cnt_text_d)[index_con_parents_d] ) - - cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest_d]) - cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent_d) - try: - cx_bigest_d_last5=cx_bigest_d[-5:] - cy_biggest_d_last5=cy_biggest_d[-5:] - dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) for j in range(len(cy_biggest_d_last5))] - ind_largest=len(cx_bigest_d)-5+np.argmin(dists_d) - cx_bigest_d_big[0]=cx_bigest_d[ind_largest] - cy_biggest_d_big[0]=cy_biggest_d[ind_largest] - except: - pass + # plt.imshow(text_only_d) + # plt.show() - (h, w) = text_only.shape[:2] - center = (w // 2.0, h // 2.0) - M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) - - M_22 = np.array(M)[:2, 
:2] - - p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) - - x_diff = p_big[0] - cx_bigest_d_big - y_diff = p_big[1] - cy_biggest_d_big - - # print(p_big) - # print(cx_bigest_d_big,cy_biggest_d_big) - # print(x_diff,y_diff) - - contours_only_text_parent_d_ordered = [] - for i in range(len(contours_only_text_parent)): - # img1=np.zeros((text_only.shape[0],text_only.shape[1],3)) - # img1=cv2.fillPoly(img1,pts=[contours_only_text_parent[i]] ,color=(1,1,1)) - # plt.imshow(img1[:,:,0]) - # plt.show() - - p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) - # print(p) - p[0] = p[0] - x_diff[0] - p[1] = p[1] - y_diff[0] - # print(p) - # print(cx_bigest_d) - # print(cy_biggest_d) - dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + (p[1] - cy_biggest_d[j]) ** 2) for j in range(len(cx_bigest_d))] - # print(np.argmin(dists)) - contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) - # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) - # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) - # plt.imshow(img2[:,:,0]) - # plt.show() - else: - contours_only_text, hir_on_text = return_contours_of_image(text_only) - contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) + min_con_area = 0.000005 + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + contours_only_text, hir_on_text = return_contours_of_image(text_only) + contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) + areas_cnt_text = np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) + areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) + contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] + contours_only_text_parent = [contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz] > min_con_area] + areas_cnt_text_parent = [areas_cnt_text[jz] for jz in 
range(len(areas_cnt_text)) if areas_cnt_text[jz] > min_con_area] - areas_cnt_text = np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) - areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) + index_con_parents = np.argsort(areas_cnt_text_parent) + contours_only_text_parent = list(np.array(contours_only_text_parent)[index_con_parents]) + areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) - contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] - contours_only_text_parent = [contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz] > min_con_area] - areas_cnt_text_parent = [areas_cnt_text[jz] for jz in range(len(areas_cnt_text)) if areas_cnt_text[jz] > min_con_area] + cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest]) + cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent) - index_con_parents = np.argsort(areas_cnt_text_parent) - contours_only_text_parent = list(np.array(contours_only_text_parent)[index_con_parents]) - areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) + contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) + contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) - cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest]) - cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent) - # print(areas_cnt_text_parent,'areas_cnt_text_parent') - # print(areas_cnt_text_parent_d,'areas_cnt_text_parent_d') - # print(len(contours_only_text_parent),len(contours_only_text_parent_d),'vizzz') + areas_cnt_text_d = np.array([cv2.contourArea(contours_only_text_parent_d[j]) for j in range(len(contours_only_text_parent_d))]) + areas_cnt_text_d = 
areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) - txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) - boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) - boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) + contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] + index_con_parents_d=np.argsort(areas_cnt_text_d) + contours_only_text_parent_d=list(np.array(contours_only_text_parent_d)[index_con_parents_d] ) + areas_cnt_text_d=list(np.array(areas_cnt_text_d)[index_con_parents_d] ) - if not self.curved_line: - slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) - slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) + cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest_d]) + cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent_d) + try: + cx_bigest_d_last5=cx_bigest_d[-5:] + cy_biggest_d_last5=cy_biggest_d[-5:] + dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) for j in range(len(cy_biggest_d_last5))] + ind_largest=len(cx_bigest_d)-5+np.argmin(dists_d) + cx_bigest_d_big[0]=cx_bigest_d[ind_largest] + cy_biggest_d_big[0]=cy_biggest_d[ind_largest] + except: + pass - else: - scale_param = 1 - all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, 
index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) - all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) - all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) - all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) - index_of_vertical_text_contours = np.array(range(len(slopes)))[(abs(np.array(slopes)) > 60)] - contours_text_vertical = [contours_only_text_parent[i] for i in index_of_vertical_text_contours] + (h, w) = text_only.shape[:2] + center = (w // 2.0, h // 2.0) + M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) - K.clear_session() - gc.collect() - # print(index_by_text_par_con,'index_by_text_par_con') + M_22 = np.array(M)[:2, :2] - if self.full_layout: - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, 
slopes, contours_only_text_parent_d_ordered) - else: - contours_only_text_parent_d_ordered = None - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) - - + p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) - if self.plotter: - self.plotter.save_plot_of_layout(text_regions_p, image_page) - self.plotter.save_plot_of_layout_all(text_regions_p, image_page) + x_diff = p_big[0] - cx_bigest_d_big + y_diff = p_big[1] - cy_biggest_d_big - K.clear_session() - gc.collect() + # print(p_big) + # print(cx_bigest_d_big,cy_biggest_d_big) + # print(x_diff,y_diff) - ##print('Job done in: '+str(time.time()-t1)) + contours_only_text_parent_d_ordered = [] + for i in range(len(contours_only_text_parent)): + # img1=np.zeros((text_only.shape[0],text_only.shape[1],3)) + # img1=cv2.fillPoly(img1,pts=[contours_only_text_parent[i]] ,color=(1,1,1)) + # plt.imshow(img1[:,:,0]) + # plt.show() - polygons_of_tabels = [] - pixel_img = 4 - polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) - all_found_texline_polygons = adhere_drop_capital_region_into_cprresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, kernel=self.kernel, curved_line=self.curved_line) + p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) + # print(p) + p[0] = p[0] - x_diff[0] + p[1] = p[1] - y_diff[0] + # print(p) + # print(cx_bigest_d) + # print(cy_biggest_d) + dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + (p[1] - 
cy_biggest_d[j]) ** 2) for j in range(len(cx_bigest_d))] + # print(np.argmin(dists)) + contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) + # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) + # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) + # plt.imshow(img2[:,:,0]) + # plt.show() + else: + contours_only_text, hir_on_text = return_contours_of_image(text_only) + contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) - # print(len(contours_only_text_parent_h),len(contours_only_text_parent_h_d_ordered),'contours_only_text_parent_h') - pixel_lines = 6 + areas_cnt_text = np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) + areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) - if not self.headers_off: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h) - else: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h_d_ordered) - elif self.headers_off: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) - else: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + contours_biggest = 
contours_only_text_parent[np.argmax(areas_cnt_text)] + contours_only_text_parent = [contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz] > min_con_area] + areas_cnt_text_parent = [areas_cnt_text[jz] for jz in range(len(areas_cnt_text)) if areas_cnt_text[jz] > min_con_area] + + index_con_parents = np.argsort(areas_cnt_text_parent) + contours_only_text_parent = list(np.array(contours_only_text_parent)[index_con_parents]) + areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) - # print(peaks_neg_fin,peaks_neg_fin_d,'num_col2') - # print(spliter_y_new,spliter_y_new_d,'num_col_classifier') - # print(matrix_of_lines_ch.shape,matrix_of_lines_ch_d.shape,'matrix_of_lines_ch') + cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest]) + cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent) + # print(areas_cnt_text_parent,'areas_cnt_text_parent') + # print(areas_cnt_text_parent_d,'areas_cnt_text_parent_d') + # print(len(contours_only_text_parent),len(contours_only_text_parent_d),'vizzz') - if num_col_classifier >= 3: + txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) + boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) + boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - regions_without_seperators = regions_without_seperators.astype(np.uint8) - regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) + if not self.curved_line: + slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) + slopes_marginals, 
all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) + + else: + scale_param = 1 + all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) + all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) + index_of_vertical_text_contours = np.array(range(len(slopes)))[(abs(np.array(slopes)) > 60)] + contours_text_vertical = [contours_only_text_parent[i] for i in index_of_vertical_text_contours] + + K.clear_session() + gc.collect() + # print(index_by_text_par_con,'index_by_text_par_con') + + if self.full_layout: + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, 
all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + else: + contours_only_text_parent_d_ordered = None + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + + - random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) - random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - random_pixels_for_image[random_pixels_for_image != 0] = 1 + if self.plotter: + self.plotter.save_plot_of_layout(text_regions_p, image_page) + self.plotter.save_plot_of_layout_all(text_regions_p, image_page) - regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 5)] = 1 + K.clear_session() + gc.collect() - else: + ##print('Job done in: '+str(time.time()-t1)) - regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) - regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) + polygons_of_tabels = [] + pixel_img = 4 + polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) + all_found_texline_polygons = adhere_drop_capital_region_into_cprresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, 
all_found_texline_polygons_h, kernel=self.kernel, curved_line=self.curved_line) - random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) - random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - random_pixels_for_image[random_pixels_for_image != 0] = 1 + # print(len(contours_only_text_parent_h),len(contours_only_text_parent_h_d_ordered),'contours_only_text_parent_h') + pixel_lines = 6 - regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 5)] = 1 + if not self.headers_off: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h) else: - pass - + num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h_d_ordered) + elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) + num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) else: - boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) + num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) - # print(slopes) - if self.plotter: - 
self.plotter.write_images_into_directory(polygons_of_images, image_page) + # print(peaks_neg_fin,peaks_neg_fin_d,'num_col2') + # print(spliter_y_new,spliter_y_new_d,'num_col_classifier') + # print(matrix_of_lines_ch.shape,matrix_of_lines_ch_d.shape,'matrix_of_lines_ch') + + if num_col_classifier >= 3: - if self.full_layout: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) + regions_without_seperators = regions_without_seperators.astype(np.uint8) + regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) + + random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) + random_pixels_for_image[random_pixels_for_image < -0.5] = 0 + random_pixels_for_image[random_pixels_for_image != 0] = 1 + + regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 5)] = 1 + else: - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - self.write_into_page_xml_full(contours_only_text_parent, contours_only_text_parent_h, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals) + regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) + regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) + + random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) + random_pixels_for_image[random_pixels_for_image < -0.5] = 0 + 
random_pixels_for_image[random_pixels_for_image != 0] = 1 + + regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 5)] = 1 else: - contours_only_text_parent_h = None - # self.logger.debug('bura galmir?') - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - #contours_only_text_parent = list(np.array(contours_only_text_parent)[index_by_text_par_con]) - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) - else: - contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - # order_text_new , id_of_texts_tot=self.do_order_of_regions(contours_only_text_parent,contours_only_text_parent_h,boxes,textline_mask_tot) - self.write_into_page_xml(txt_con_org, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals) + pass + + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) + else: + boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) + + # print(slopes) + if self.plotter: + self.plotter.write_images_into_directory(polygons_of_images, image_page) + + if self.full_layout: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) + else: + order_text_new, id_of_texts_tot = 
self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) + + self.write_into_page_xml_full(contours_only_text_parent, contours_only_text_parent_h, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals) + else: + contours_only_text_parent_h = None + # self.logger.debug('bura galmir?') + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + #contours_only_text_parent = list(np.array(contours_only_text_parent)[index_by_text_par_con]) + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) + else: + contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) + # order_text_new , id_of_texts_tot=self.do_order_of_regions(contours_only_text_parent,contours_only_text_parent_h,boxes,textline_mask_tot) + self.write_into_page_xml(txt_con_org, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals) self.logger.info("Job done in %ss", str(time.time() - t1)) From a14462db22dba86c3d15c7f6bc335d7b666622e3 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 5 Feb 2021 14:58:49 +0100 Subject: [PATCH 34/89] remove else-pass, reset t1 after measuring --- sbb_newspapers_org_image/eynollah.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 
deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index bb8b12d..1b8eda0 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -749,8 +749,6 @@ class eynollah: img = img.astype(np.uint8) if img_width_h >= 2000: img = resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9)) - else: - pass # img= resize_image(img, int(img_height_h*1), int(img_width_h*1) ) img = img.astype(np.uint8) if patches and cols == 1: @@ -1808,9 +1806,7 @@ class eynollah: rate_two_models=text_sume_second/float(text_sume_early)*100 self.logger.info("ratio_of_two_models: %s", rate_two_models) - if is_image_enhanced and rate_two_models<95.50:#98.45: - pass - else: + if not(is_image_enhanced and rate_two_models<95.50):#98.45: prediction_regions_org=np.copy(prediction_regions_org_copy) ##prediction_regions_org[mask_lines2[:,:]==1]=3 @@ -2178,11 +2174,13 @@ class eynollah: img_org, img_res, is_image_enhanced = self.resize_image_with_column_classifier(is_image_enhanced) self.get_image_and_scales_after_enhancing(img_org, img_res) self.logger.info("Enhancing took %ss ", str(time.time() - t1)) + t1 = time.time() text_regions_p_1 = self.get_regions_from_xy_2models(img_res, is_image_enhanced) K.clear_session() gc.collect() self.logger.info("Textregion detection took %ss " + str(time.time() - t1)) + t1 = time.time() img_g = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) img_g = img_g.astype(np.uint8) @@ -2251,6 +2249,7 @@ class eynollah: if self.plotter: self.plotter.save_plot_of_textlines(textline_mask_tot_ea, image_page) self.logger.info("textline detection took %ss", str(time.time() - t1)) + t1 = time.time() # plt.imshow(textline_mask_tot_ea) # plt.show() # sys.exit() @@ -2269,6 +2268,7 @@ class eynollah: ##plt.show() ##sys.exit() self.logger.info("deskewing: " + str(time.time() - t1)) + t1 = time.time() image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] 
textline_mask_tot[mask_images[:, :] == 1] = 0 @@ -2285,9 +2285,7 @@ class eynollah: try: regions_without_seperators = (text_regions_p[:, :] == 1) * 1 regions_without_seperators = regions_without_seperators.astype(np.uint8) - text_regions_p = get_marginals(rotate_image(regions_without_seperators, slope_deskew), text_regions_p, num_col_classifier, slope_deskew, kernel=self.kernel) - except: pass @@ -2299,6 +2297,7 @@ class eynollah: self.plotter.save_plot_of_layout_main(text_regions_p, image_page) self.logger.info("detection of marginals took %ss", str(time.time() - t1)) + t1 = time.time() if not self.full_layout: @@ -2344,6 +2343,7 @@ class eynollah: self.logger.debug("len(boxes): %s", len(boxes)) self.logger.info("detecting boxes took %ss", str(time.time() - t1)) + t1 = time.time() img_revised_tab = text_regions_p[:, :] pixel_img = 2 polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) @@ -2431,6 +2431,7 @@ class eynollah: gc.collect() img_revised_tab = np.copy(text_regions_p[:, :]) self.logger.info("detection of full layout took %ss", str(time.time() - t1)) + t1 = time.time() pixel_img = 5 polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) @@ -2588,8 +2589,6 @@ class eynollah: K.clear_session() gc.collect() - ##print('Job done in: '+str(time.time()-t1)) - polygons_of_tabels = [] pixel_img = 4 polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) @@ -2618,25 +2617,17 @@ class eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: regions_without_seperators = regions_without_seperators.astype(np.uint8) regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) - random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) random_pixels_for_image[random_pixels_for_image < -0.5] = 0 random_pixels_for_image[random_pixels_for_image != 0] = 1 - 
regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 5)] = 1 - else: - regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) - random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) random_pixels_for_image[random_pixels_for_image < -0.5] = 0 random_pixels_for_image[random_pixels_for_image != 0] = 1 - regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 5)] = 1 - else: - pass if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) From bf6eaafbc730804757d4db7334937f8c30e58aad Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 5 Feb 2021 16:16:13 +0100 Subject: [PATCH 35/89] untangle run --- sbb_newspapers_org_image/eynollah.py | 59 +++++++++++++--------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 1b8eda0..d924a19 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -387,7 +387,7 @@ class eynollah: return img, img_new, is_image_enhanced - def resize_and_enhance_image_with_column_classifier(self, is_image_enhanced): + def resize_and_enhance_image_with_column_classifier(self): self.logger.debug("enter resize_and_enhance_image_with_column_classifier") dpi = self.check_dpi() self.logger.info("Detected %s DPI" % dpi) @@ -432,19 +432,17 @@ class eynollah: del page_coord K.clear_session() gc.collect() - self.logger.info("%s DPI" % dpi) if dpi < 298: img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) image_res = self.predict_enhancement(img_new) - # 
cv2.imwrite(os.path.join(self.dir_out, self.image_filename_stem) + ".tif",self.image) - # self.image=self.image.astype(np.uint16) is_image_enhanced = True else: is_image_enhanced = False num_column_is_classified = True image_res = np.copy(img) + self.logger.debug("exit resize_and_enhance_image_with_column_classifier") return is_image_enhanced, img, image_res, num_col, num_column_is_classified def get_image_and_scales(self, img_org, img_res, scale): @@ -463,10 +461,10 @@ class eynollah: # Also set for the plotter # XXX TODO hacky - #self.plotter.image_org = self.image_org - - #self.plotter.scale_y = self.scale_y - #self.plotter.scale_x = self.scale_x + if self.plotter: + self.plotter.image_org = self.image_org + self.plotter.scale_y = self.scale_y + self.plotter.scale_x = self.scale_x def get_image_and_scales_after_enhancing(self, img_org, img_res): @@ -625,7 +623,7 @@ class eynollah: return prediction_true def early_page_for_num_of_column_classification(self): - self.logger.debug("enter resize_and_enhance_image_with_column_classifier") + self.logger.debug("enter early_page_for_num_of_column_classification") img = cv2.imread(self.image_filename) img = img.astype(np.uint8) patches = False @@ -661,7 +659,7 @@ class eynollah: del img_page_prediction gc.collect() - self.logger.debug("exit resize_and_enhance_image_with_column_classifier") + self.logger.debug("exit early_page_for_num_of_column_classification") return croped_page, page_coord def extract_page(self): @@ -1852,6 +1850,7 @@ class eynollah: del img_org gc.collect() + K.clear_session() return text_regions_p_true def do_order_of_regions_full_layout(self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): @@ -2141,21 +2140,10 @@ class eynollah: return self.do_order_of_regions_full_layout(*args, **kwargs) return self.do_order_of_regions_no_full_layout(*args, **kwargs) - def run(self): - """ - Get image and scales, then extract the page of scanned image - """ - 
self.logger.debug("enter run") - is_image_enhanced = False - t1 = time.time() - - ########## - - ###is_image_enhanced,img_org,img_res=self.resize_and_enhance_image(is_image_enhanced) + def run_enhancement(self): self.logger.info("resize and enhance image") - is_image_enhanced, img_org, img_res, num_col_classifier, num_column_is_classified = self.resize_and_enhance_image_with_column_classifier(is_image_enhanced) + is_image_enhanced, img_org, img_res, _, num_column_is_classified = self.resize_and_enhance_image_with_column_classifier() self.logger.info("Image is %senhanced", '' if is_image_enhanced else 'not ') - K.clear_session() scale = 1 if is_image_enhanced: @@ -2173,13 +2161,22 @@ class eynollah: if self.allow_scaling: img_org, img_res, is_image_enhanced = self.resize_image_with_column_classifier(is_image_enhanced) self.get_image_and_scales_after_enhancing(img_org, img_res) - self.logger.info("Enhancing took %ss ", str(time.time() - t1)) + return img_res, is_image_enhanced, num_column_is_classified + + def run(self): + """ + Get image and scales, then extract the page of scanned image + """ + self.logger.debug("enter run") + is_image_enhanced = False + t1 = time.time() + img_res, is_image_enhanced, num_column_is_classified = self.run_enhancement() + self.logger.info("Enhancing took %ss ", str(time.time() - t1)) + t1 = time.time() text_regions_p_1 = self.get_regions_from_xy_2models(img_res, is_image_enhanced) - K.clear_session() - gc.collect() - self.logger.info("Textregion detection took %ss " + str(time.time() - t1)) + self.logger.info("Textregion detection took %ss ", str(time.time() - t1)) t1 = time.time() img_g = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) @@ -2224,7 +2221,8 @@ class eynollah: peaks_neg_fin = [] #print(num_col, "num_colnum_col") - if num_col is None: + if not num_col: + self.logger.info("No columns detected, outputting an empty PAGE-XML") txt_con_org = [] order_text_new = [] id_of_texts_tot = [] @@ -2252,7 +2250,6 @@ class eynollah: 
t1 = time.time() # plt.imshow(textline_mask_tot_ea) # plt.show() - # sys.exit() sigma = 2 main_page_deskew = True @@ -2261,12 +2258,11 @@ class eynollah: if self.plotter: self.plotter.save_deskewed_image(slope_deskew) - # img_rotated=rotyate_image_different(self.image_org,slope_deskew) self.logger.info("slope_deskew: %s", slope_deskew) ##plt.imshow(img_rotated) ##plt.show() - ##sys.exit() + self.logger.info("deskewing: " + str(time.time() - t1)) t1 = time.time() @@ -2634,7 +2630,6 @@ class eynollah: else: boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) - # print(slopes) if self.plotter: self.plotter.write_images_into_directory(polygons_of_images, image_page) From 9dca7426948e1a707db2624dc37c1a19699ba3ff Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 5 Feb 2021 17:26:45 +0100 Subject: [PATCH 36/89] further untangle run --- sbb_newspapers_org_image/eynollah.py | 776 +++++++++++++-------------- 1 file changed, 385 insertions(+), 391 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index d924a19..5bf67e8 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -705,7 +705,9 @@ class eynollah: del img del imgray + K.clear_session() gc.collect() + self.logger.debug("exit extract_page") return croped_page, page_coord def extract_text_regions(self, img, patches, cols): @@ -2140,6 +2142,45 @@ class eynollah: return self.do_order_of_regions_full_layout(*args, **kwargs) return self.do_order_of_regions_no_full_layout(*args, **kwargs) + def run_graphics_and_columns(self, text_regions_p_1, num_column_is_classified): + img_g = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) + img_g = img_g.astype(np.uint8) + + img_g3 = np.zeros((img_g.shape[0], img_g.shape[1], 3)) + img_g3 = img_g3.astype(np.uint8) + img_g3[:, :, 0] = img_g[:, :] + img_g3[:, :, 1] = img_g[:, :] + img_g3[:, 
:, 2] = img_g[:, :] + + image_page, page_coord = self.extract_page() + if self.plotter: + self.plotter.save_page_image(image_page) + + img_g3_page = img_g3[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3], :] + + text_regions_p_1 = text_regions_p_1[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + + mask_images = (text_regions_p_1[:, :] == 2) * 1 + mask_images = mask_images.astype(np.uint8) + mask_images = cv2.erode(mask_images[:, :], self.kernel, iterations=10) + + mask_lines = (text_regions_p_1[:, :] == 3) * 1 + mask_lines = mask_lines.astype(np.uint8) + + img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 + img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) + img_only_regions = cv2.erode(img_only_regions_with_sep[:, :], self.kernel, iterations=6) + + try: + num_col, peaks_neg_fin = find_num_col(img_only_regions, multiplier=6.0) + if not num_column_is_classified: + num_col_classifier = num_col + 1 + except: + num_col = None + peaks_neg_fin = [] + num_col_classifier = None + return num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines + def run_enhancement(self): self.logger.info("resize and enhance image") is_image_enhanced, img_org, img_res, _, num_column_is_classified = self.resize_and_enhance_image_with_column_classifier() @@ -2163,6 +2204,7 @@ class eynollah: self.get_image_and_scales_after_enhancing(img_org, img_res) return img_res, is_image_enhanced, num_column_is_classified + def run(self): """ Get image and scales, then extract the page of scanned image @@ -2177,479 +2219,431 @@ class eynollah: t1 = time.time() text_regions_p_1 = self.get_regions_from_xy_2models(img_res, is_image_enhanced) self.logger.info("Textregion detection took %ss ", str(time.time() - t1)) - t1 = time.time() - img_g = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) - img_g = img_g.astype(np.uint8) + t1 = time.time() + num_col, num_col_classifier, 
img_only_regions, page_coord, image_page, mask_images, mask_lines = self.run_graphics_and_columns(text_regions_p_1, num_column_is_classified) + self.logger.info("Graphics detection took %ss ", str(time.time() - t1)) - img_g3 = np.zeros((img_g.shape[0], img_g.shape[1], 3)) - img_g3 = img_g3.astype(np.uint8) - img_g3[:, :, 0] = img_g[:, :] - img_g3[:, :, 1] = img_g[:, :] - img_g3[:, :, 2] = img_g[:, :] + #print(num_col, "num_colnum_col") + if not num_col: + self.logger.info("No columns detected, outputting an empty PAGE-XML") + self.write_into_page_xml([], page_coord, self.dir_out, [], [], [], [], [], [], [], [], self.curved_line, [], []) + self.logger.info("Job done in %ss", str(time.time() - t1)) + return + patches = True + scaler_h_textline = 1 # 1.2#1.2 + scaler_w_textline = 1 # 0.9#1 + textline_mask_tot_ea, textline_mask_tot_long_shot = self.textline_contours(image_page, patches, scaler_h_textline, scaler_w_textline) - image_page, page_coord = self.extract_page() - # print(image_page.shape,'page') - if self.plotter: - self.plotter.save_page_image(image_page) K.clear_session() gc.collect() + #print(np.unique(textline_mask_tot_ea[:, :]), "textline") + if self.plotter: + self.plotter.save_plot_of_textlines(textline_mask_tot_ea, image_page) + self.logger.info("textline detection took %ss", str(time.time() - t1)) + t1 = time.time() + # plt.imshow(textline_mask_tot_ea) + # plt.show() - img_g3_page = img_g3[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3], :] - del img_g3 - del img_g + sigma = 2 + main_page_deskew = True + slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2), sigma, main_page_deskew, plotter=self.plotter) + slope_first = 0 # return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2),sigma, plotter=self.plotter) - text_regions_p_1 = text_regions_p_1[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] + if self.plotter: + self.plotter.save_deskewed_image(slope_deskew) + 
self.logger.info("slope_deskew: %s", slope_deskew) - mask_images = (text_regions_p_1[:, :] == 2) * 1 - mask_images = mask_images.astype(np.uint8) - mask_images = cv2.erode(mask_images[:, :], self.kernel, iterations=10) + ##plt.imshow(img_rotated) + ##plt.show() - mask_lines = (text_regions_p_1[:, :] == 3) * 1 - mask_lines = mask_lines.astype(np.uint8) + self.logger.info("deskewing: " + str(time.time() - t1)) + t1 = time.time() - img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 - img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) - img_only_regions = cv2.erode(img_only_regions_with_sep[:, :], self.kernel, iterations=6) + image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] + textline_mask_tot[mask_images[:, :] == 1] = 0 - try: - num_col, peaks_neg_fin = find_num_col(img_only_regions, multiplier=6.0) - if not num_column_is_classified: - num_col_classifier = num_col + 1 - except: - num_col = None - peaks_neg_fin = [] + pixel_img = 1 + min_area = 0.00001 + max_area = 0.0006 + textline_mask_tot_small_size = return_contours_of_interested_region_by_size(textline_mask_tot, pixel_img, min_area, max_area) + text_regions_p_1[mask_lines[:, :] == 1] = 3 + text_regions_p = text_regions_p_1[:, :] # long_short_region[:,:]#self.get_regions_from_2_models(image_page) + text_regions_p = np.array(text_regions_p) - #print(num_col, "num_colnum_col") - if not num_col: - self.logger.info("No columns detected, outputting an empty PAGE-XML") - txt_con_org = [] - order_text_new = [] - id_of_texts_tot = [] - all_found_texline_polygons = [] - all_box_coord = [] - polygons_of_images = [] - polygons_of_marginals = [] - all_found_texline_polygons_marginals = [] - all_box_coord_marginals = [] - slopes = [] - slopes_marginals = [] - self.write_into_page_xml(txt_con_org, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, 
polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals) - else: - patches = True - scaler_h_textline = 1 # 1.2#1.2 - scaler_w_textline = 1 # 0.9#1 - textline_mask_tot_ea, textline_mask_tot_long_shot = self.textline_contours(image_page, patches, scaler_h_textline, scaler_w_textline) + if num_col_classifier == 1 or num_col_classifier == 2: + try: + regions_without_seperators = (text_regions_p[:, :] == 1) * 1 + regions_without_seperators = regions_without_seperators.astype(np.uint8) + text_regions_p = get_marginals(rotate_image(regions_without_seperators, slope_deskew), text_regions_p, num_col_classifier, slope_deskew, kernel=self.kernel) + except: + pass + + # plt.imshow(text_regions_p) + # plt.show() + if self.plotter: + self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) + self.plotter.save_plot_of_layout_main(text_regions_p, image_page) + + self.logger.info("detection of marginals took %ss", str(time.time() - t1)) + t1 = time.time() + + if not self.full_layout: + + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n = rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, slope_deskew) + text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) + regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 + regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) + + pixel_lines = 3 + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), 
num_col_classifier, pixel_lines) + + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) K.clear_session() gc.collect() - #print(np.unique(textline_mask_tot_ea[:, :]), "textline") - if self.plotter: - self.plotter.save_plot_of_textlines(textline_mask_tot_ea, image_page) - self.logger.info("textline detection took %ss", str(time.time() - t1)) - t1 = time.time() - # plt.imshow(textline_mask_tot_ea) - # plt.show() - sigma = 2 - main_page_deskew = True - slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2), sigma, main_page_deskew, plotter=self.plotter) - slope_first = 0 # return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2),sigma, plotter=self.plotter) + self.logger.info("num_col_classifier: %s", num_col_classifier) - if self.plotter: - self.plotter.save_deskewed_image(slope_deskew) - self.logger.info("slope_deskew: %s", slope_deskew) + if num_col_classifier >= 3: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + regions_without_seperators = regions_without_seperators.astype(np.uint8) + regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) + #random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) + #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 + #random_pixels_for_image[random_pixels_for_image != 0] = 1 + #regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 2)] = 1 + else: + regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) + regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) + #random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], 
regions_without_seperators_d.shape[1]) + #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 + #random_pixels_for_image[random_pixels_for_image != 0] = 1 + + #regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 2)] = 1 - ##plt.imshow(img_rotated) - ##plt.show() + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) + else: + boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) - self.logger.info("deskewing: " + str(time.time() - t1)) + self.logger.debug("len(boxes): %s", len(boxes)) + self.logger.info("detecting boxes took %ss", str(time.time() - t1)) t1 = time.time() + img_revised_tab = text_regions_p[:, :] + pixel_img = 2 + polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) - image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] - textline_mask_tot[mask_images[:, :] == 1] = 0 + # plt.imshow(img_revised_tab) + # plt.show() + K.clear_session() - pixel_img = 1 - min_area = 0.00001 - max_area = 0.0006 - textline_mask_tot_small_size = return_contours_of_interested_region_by_size(textline_mask_tot, pixel_img, min_area, max_area) - text_regions_p_1[mask_lines[:, :] == 1] = 3 - text_regions_p = text_regions_p_1[:, :] # long_short_region[:,:]#self.get_regions_from_2_models(image_page) - text_regions_p = np.array(text_regions_p) + pixel_img = 4 + min_area_mar = 0.00001 + polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) - if num_col_classifier == 1 or num_col_classifier == 2: - try: - regions_without_seperators = (text_regions_p[:, :] == 1) * 1 - regions_without_seperators = regions_without_seperators.astype(np.uint8) - text_regions_p = 
get_marginals(rotate_image(regions_without_seperators, slope_deskew), text_regions_p, num_col_classifier, slope_deskew, kernel=self.kernel) - except: - pass + if self.full_layout: + # set first model with second model + text_regions_p[:, :][text_regions_p[:, :] == 2] = 5 + text_regions_p[:, :][text_regions_p[:, :] == 3] = 6 + text_regions_p[:, :][text_regions_p[:, :] == 4] = 8 - # plt.imshow(text_regions_p) - # plt.show() + K.clear_session() + # gc.collect() + patches = True + image_page = image_page.astype(np.uint8) - if self.plotter: - self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) - self.plotter.save_plot_of_layout_main(text_regions_p, image_page) + # print(type(image_page)) + regions_fully, regions_fully_only_drop = self.extract_text_regions(image_page, patches, cols=num_col_classifier) + text_regions_p[:,:][regions_fully[:,:,0]==6]=6 - self.logger.info("detection of marginals took %ss", str(time.time() - t1)) - t1 = time.time() + regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) + regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 4] = 4 + K.clear_session() + gc.collect() - if not self.full_layout: + # plt.imshow(regions_fully[:,:,0]) + # plt.show() - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n = rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 - regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) + regions_fully = 
putt_bb_of_drop_capitals_of_model_in_patches_in_layout(regions_fully) - pixel_lines = 3 - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + # plt.imshow(regions_fully[:,:,0]) + # plt.show() - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) - K.clear_session() - gc.collect() - - self.logger.info("num_col_classifier: %s", num_col_classifier) - - if num_col_classifier >= 3: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - regions_without_seperators = regions_without_seperators.astype(np.uint8) - regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) - #random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) - #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - #random_pixels_for_image[random_pixels_for_image != 0] = 1 - #regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 2)] = 1 - else: - regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) - regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) - #random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) - #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - #random_pixels_for_image[random_pixels_for_image != 0] = 1 + K.clear_session() + gc.collect() + patches = False + regions_fully_np, _ = self.extract_text_regions(image_page, patches, cols=num_col_classifier) - 
#regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 2)] = 1 + # plt.imshow(regions_fully_np[:,:,0]) + # plt.show() - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) - else: - boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) + if num_col_classifier > 2: + regions_fully_np[:, :, 0][regions_fully_np[:, :, 0] == 4] = 0 + else: + regions_fully_np = filter_small_drop_capitals_from_no_patch_layout(regions_fully_np, text_regions_p) - self.logger.debug("len(boxes): %s", len(boxes)) - self.logger.info("detecting boxes took %ss", str(time.time() - t1)) - t1 = time.time() - img_revised_tab = text_regions_p[:, :] - pixel_img = 2 - polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) + # plt.imshow(regions_fully_np[:,:,0]) + # plt.show() - # plt.imshow(img_revised_tab) - # plt.show() - K.clear_session() + K.clear_session() + gc.collect() - pixel_img = 4 - min_area_mar = 0.00001 - polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) - - if self.full_layout: - # set first model with second model - text_regions_p[:, :][text_regions_p[:, :] == 2] = 5 - text_regions_p[:, :][text_regions_p[:, :] == 3] = 6 - text_regions_p[:, :][text_regions_p[:, :] == 4] = 8 - - K.clear_session() - # gc.collect() - patches = True - image_page = image_page.astype(np.uint8) - - # print(type(image_page)) - regions_fully, regions_fully_only_drop = self.extract_text_regions(image_page, patches, cols=num_col_classifier) - text_regions_p[:,:][regions_fully[:,:,0]==6]=6 - - regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) - regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 4] = 4 - 
K.clear_session() - gc.collect() - - # plt.imshow(regions_fully[:,:,0]) - # plt.show() + # plt.imshow(regions_fully[:,:,0]) + # plt.show() - regions_fully = putt_bb_of_drop_capitals_of_model_in_patches_in_layout(regions_fully) + regions_fully = boosting_headers_by_longshot_region_segmentation(regions_fully, regions_fully_np, img_only_regions) - # plt.imshow(regions_fully[:,:,0]) - # plt.show() + # plt.imshow(regions_fully[:,:,0]) + # plt.show() - K.clear_session() - gc.collect() - patches = False - regions_fully_np, _ = self.extract_text_regions(image_page, patches, cols=num_col_classifier) + text_regions_p[:, :][regions_fully[:, :, 0] == 4] = 4 + text_regions_p[:, :][regions_fully_np[:, :, 0] == 4] = 4 - # plt.imshow(regions_fully_np[:,:,0]) - # plt.show() + #plt.imshow(text_regions_p) + #plt.show() - if num_col_classifier > 2: - regions_fully_np[:, :, 0][regions_fully_np[:, :, 0] == 4] = 0 - else: - regions_fully_np = filter_small_drop_capitals_from_no_patch_layout(regions_fully_np, text_regions_p) + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout(image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) - # plt.imshow(regions_fully_np[:,:,0]) - # plt.show() + text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) + regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) + regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 - K.clear_session() - gc.collect() + regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) - # plt.imshow(regions_fully[:,:,0]) - # plt.show() + 
K.clear_session() + gc.collect() + img_revised_tab = np.copy(text_regions_p[:, :]) + self.logger.info("detection of full layout took %ss", str(time.time() - t1)) + t1 = time.time() + pixel_img = 5 + polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) - regions_fully = boosting_headers_by_longshot_region_segmentation(regions_fully, regions_fully_np, img_only_regions) + # plt.imshow(img_revised_tab) + # plt.show() - # plt.imshow(regions_fully[:,:,0]) - # plt.show() + # print(img_revised_tab.shape,text_regions_p_1_n.shape) + # text_regions_p_1_n=resize_image(text_regions_p_1_n,img_revised_tab.shape[0],img_revised_tab.shape[1]) + # print(np.unique(text_regions_p_1_n),'uni') - text_regions_p[:, :][regions_fully[:, :, 0] == 4] = 4 - text_regions_p[:, :][regions_fully_np[:, :, 0] == 4] = 4 + text_only = ((img_revised_tab[:, :] == 1)) * 1 + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + text_only_d = ((text_regions_p_1_n[:, :] == 1)) * 1 + ##text_only_h=( (img_revised_tab[:,:,0]==2) )*1 - #plt.imshow(text_regions_p) - #plt.show() + # print(text_only.shape,text_only_d.shape) + # plt.imshow(text_only) + # plt.show() - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout(image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) + # plt.imshow(text_only_d) + # plt.show() - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 + min_con_area = 0.000005 + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + contours_only_text, hir_on_text = return_contours_of_image(text_only) + contours_only_text_parent = 
return_parent_contours(contours_only_text, hir_on_text) + areas_cnt_text = np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) + areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) + contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] + contours_only_text_parent = [contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz] > min_con_area] + areas_cnt_text_parent = [areas_cnt_text[jz] for jz in range(len(areas_cnt_text)) if areas_cnt_text[jz] > min_con_area] - regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) + index_con_parents = np.argsort(areas_cnt_text_parent) + contours_only_text_parent = list(np.array(contours_only_text_parent)[index_con_parents]) + areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) - K.clear_session() - gc.collect() - img_revised_tab = np.copy(text_regions_p[:, :]) - self.logger.info("detection of full layout took %ss", str(time.time() - t1)) - t1 = time.time() - pixel_img = 5 - polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) + cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest]) + cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent) - # plt.imshow(img_revised_tab) - # plt.show() + contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) + contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) - # print(img_revised_tab.shape,text_regions_p_1_n.shape) - # text_regions_p_1_n=resize_image(text_regions_p_1_n,img_revised_tab.shape[0],img_revised_tab.shape[1]) - # print(np.unique(text_regions_p_1_n),'uni') + areas_cnt_text_d = 
np.array([cv2.contourArea(contours_only_text_parent_d[j]) for j in range(len(contours_only_text_parent_d))]) + areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) - text_only = ((img_revised_tab[:, :] == 1)) * 1 - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - text_only_d = ((text_regions_p_1_n[:, :] == 1)) * 1 - ##text_only_h=( (img_revised_tab[:,:,0]==2) )*1 + contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] + index_con_parents_d=np.argsort(areas_cnt_text_d) + contours_only_text_parent_d=list(np.array(contours_only_text_parent_d)[index_con_parents_d] ) + areas_cnt_text_d=list(np.array(areas_cnt_text_d)[index_con_parents_d] ) - # print(text_only.shape,text_only_d.shape) - # plt.imshow(text_only) - # plt.show() + cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest_d]) + cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent_d) + try: + cx_bigest_d_last5=cx_bigest_d[-5:] + cy_biggest_d_last5=cy_biggest_d[-5:] + dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) for j in range(len(cy_biggest_d_last5))] + ind_largest=len(cx_bigest_d)-5+np.argmin(dists_d) + cx_bigest_d_big[0]=cx_bigest_d[ind_largest] + cy_biggest_d_big[0]=cy_biggest_d[ind_largest] + except: + pass - # plt.imshow(text_only_d) - # plt.show() + (h, w) = text_only.shape[:2] + center = (w // 2.0, h // 2.0) + M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) - min_con_area = 0.000005 - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text, hir_on_text = return_contours_of_image(text_only) - contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) - areas_cnt_text = np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) - areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) - 
contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] - contours_only_text_parent = [contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz] > min_con_area] - areas_cnt_text_parent = [areas_cnt_text[jz] for jz in range(len(areas_cnt_text)) if areas_cnt_text[jz] > min_con_area] - - index_con_parents = np.argsort(areas_cnt_text_parent) - contours_only_text_parent = list(np.array(contours_only_text_parent)[index_con_parents]) - areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) - - cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest]) - cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent) - - contours_only_text_d, hir_on_text_d = return_contours_of_image(text_only_d) - contours_only_text_parent_d = return_parent_contours(contours_only_text_d, hir_on_text_d) - - areas_cnt_text_d = np.array([cv2.contourArea(contours_only_text_parent_d[j]) for j in range(len(contours_only_text_parent_d))]) - areas_cnt_text_d = areas_cnt_text_d / float(text_only_d.shape[0] * text_only_d.shape[1]) - - contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] - index_con_parents_d=np.argsort(areas_cnt_text_d) - contours_only_text_parent_d=list(np.array(contours_only_text_parent_d)[index_con_parents_d] ) - areas_cnt_text_d=list(np.array(areas_cnt_text_d)[index_con_parents_d] ) - - cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest_d]) - cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent_d) - try: - cx_bigest_d_last5=cx_bigest_d[-5:] - cy_biggest_d_last5=cy_biggest_d[-5:] - dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) for j in range(len(cy_biggest_d_last5))] - ind_largest=len(cx_bigest_d)-5+np.argmin(dists_d) - 
cx_bigest_d_big[0]=cx_bigest_d[ind_largest] - cy_biggest_d_big[0]=cy_biggest_d[ind_largest] - except: - pass - - (h, w) = text_only.shape[:2] - center = (w // 2.0, h // 2.0) - M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) - - M_22 = np.array(M)[:2, :2] - - p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) - - x_diff = p_big[0] - cx_bigest_d_big - y_diff = p_big[1] - cy_biggest_d_big - - # print(p_big) - # print(cx_bigest_d_big,cy_biggest_d_big) - # print(x_diff,y_diff) - - contours_only_text_parent_d_ordered = [] - for i in range(len(contours_only_text_parent)): - # img1=np.zeros((text_only.shape[0],text_only.shape[1],3)) - # img1=cv2.fillPoly(img1,pts=[contours_only_text_parent[i]] ,color=(1,1,1)) - # plt.imshow(img1[:,:,0]) - # plt.show() - - p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) - # print(p) - p[0] = p[0] - x_diff[0] - p[1] = p[1] - y_diff[0] - # print(p) - # print(cx_bigest_d) - # print(cy_biggest_d) - dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + (p[1] - cy_biggest_d[j]) ** 2) for j in range(len(cx_bigest_d))] - # print(np.argmin(dists)) - contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) - # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) - # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) - # plt.imshow(img2[:,:,0]) - # plt.show() - else: - contours_only_text, hir_on_text = return_contours_of_image(text_only) - contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) + M_22 = np.array(M)[:2, :2] - areas_cnt_text = np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) - areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) + p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) - contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] - contours_only_text_parent = [contours_only_text_parent[jz] for jz in 
range(len(contours_only_text_parent)) if areas_cnt_text[jz] > min_con_area] - areas_cnt_text_parent = [areas_cnt_text[jz] for jz in range(len(areas_cnt_text)) if areas_cnt_text[jz] > min_con_area] + x_diff = p_big[0] - cx_bigest_d_big + y_diff = p_big[1] - cy_biggest_d_big - index_con_parents = np.argsort(areas_cnt_text_parent) - contours_only_text_parent = list(np.array(contours_only_text_parent)[index_con_parents]) - areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) + # print(p_big) + # print(cx_bigest_d_big,cy_biggest_d_big) + # print(x_diff,y_diff) - cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest]) - cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent) - # print(areas_cnt_text_parent,'areas_cnt_text_parent') - # print(areas_cnt_text_parent_d,'areas_cnt_text_parent_d') - # print(len(contours_only_text_parent),len(contours_only_text_parent_d),'vizzz') + contours_only_text_parent_d_ordered = [] + for i in range(len(contours_only_text_parent)): + # img1=np.zeros((text_only.shape[0],text_only.shape[1],3)) + # img1=cv2.fillPoly(img1,pts=[contours_only_text_parent[i]] ,color=(1,1,1)) + # plt.imshow(img1[:,:,0]) + # plt.show() - txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) - boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) - boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) + p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) + # print(p) + p[0] = p[0] - x_diff[0] + p[1] = p[1] - y_diff[0] + # print(p) + # print(cx_bigest_d) + # print(cy_biggest_d) + dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + (p[1] - cy_biggest_d[j]) ** 2) for j in range(len(cx_bigest_d))] + # print(np.argmin(dists)) + contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) + # 
img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) + # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) + # plt.imshow(img2[:,:,0]) + # plt.show() + else: + contours_only_text, hir_on_text = return_contours_of_image(text_only) + contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) - if not self.curved_line: - slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) - slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) + areas_cnt_text = np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) + areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) - else: - scale_param = 1 - all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) - all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) - all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, 
iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) - all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) - index_of_vertical_text_contours = np.array(range(len(slopes)))[(abs(np.array(slopes)) > 60)] - contours_text_vertical = [contours_only_text_parent[i] for i in index_of_vertical_text_contours] + contours_biggest = contours_only_text_parent[np.argmax(areas_cnt_text)] + contours_only_text_parent = [contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz] > min_con_area] + areas_cnt_text_parent = [areas_cnt_text[jz] for jz in range(len(areas_cnt_text)) if areas_cnt_text[jz] > min_con_area] - K.clear_session() - gc.collect() - # print(index_by_text_par_con,'index_by_text_par_con') + index_con_parents = np.argsort(areas_cnt_text_parent) + contours_only_text_parent = list(np.array(contours_only_text_parent)[index_con_parents]) + areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) - if self.full_layout: - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) - else: - contours_only_text_parent_d_ordered = None - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, 
slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) - - + cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest]) + cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent) + # print(areas_cnt_text_parent,'areas_cnt_text_parent') + # print(areas_cnt_text_parent_d,'areas_cnt_text_parent_d') + # print(len(contours_only_text_parent),len(contours_only_text_parent_d),'vizzz') - if self.plotter: - self.plotter.save_plot_of_layout(text_regions_p, image_page) - self.plotter.save_plot_of_layout_all(text_regions_p, image_page) + txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) + boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) + boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) - K.clear_session() - gc.collect() + if not self.curved_line: + slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) + slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) - polygons_of_tabels = [] - pixel_img = 4 - polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) - all_found_texline_polygons = 
adhere_drop_capital_region_into_cprresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, kernel=self.kernel, curved_line=self.curved_line) + else: + scale_param = 1 + all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) + all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) + index_of_vertical_text_contours = np.array(range(len(slopes)))[(abs(np.array(slopes)) > 60)] + contours_text_vertical = [contours_only_text_parent[i] for i in index_of_vertical_text_contours] - # print(len(contours_only_text_parent_h),len(contours_only_text_parent_h_d_ordered),'contours_only_text_parent_h') - pixel_lines = 6 + K.clear_session() + gc.collect() + # print(index_by_text_par_con,'index_by_text_par_con') - if not self.headers_off: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = 
find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h) - else: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h_d_ordered) - elif self.headers_off: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) - else: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + if self.full_layout: + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + else: + contours_only_text_parent_d_ordered = None + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, 
all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + + - # print(peaks_neg_fin,peaks_neg_fin_d,'num_col2') - # print(spliter_y_new,spliter_y_new_d,'num_col_classifier') - # print(matrix_of_lines_ch.shape,matrix_of_lines_ch_d.shape,'matrix_of_lines_ch') + if self.plotter: + self.plotter.save_plot_of_layout(text_regions_p, image_page) + self.plotter.save_plot_of_layout_all(text_regions_p, image_page) - if num_col_classifier >= 3: + K.clear_session() + gc.collect() - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - regions_without_seperators = regions_without_seperators.astype(np.uint8) - regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) - random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) - random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - random_pixels_for_image[random_pixels_for_image != 0] = 1 - regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 5)] = 1 - else: - regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) - regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) - random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) - random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - random_pixels_for_image[random_pixels_for_image != 0] = 1 - regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 5)] = 1 + polygons_of_tabels = [] + pixel_img = 4 + polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) + all_found_texline_polygons = adhere_drop_capital_region_into_cprresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, 
all_found_texline_polygons, all_found_texline_polygons_h, kernel=self.kernel, curved_line=self.curved_line) + # print(len(contours_only_text_parent_h),len(contours_only_text_parent_h_d_ordered),'contours_only_text_parent_h') + pixel_lines = 6 + + if not self.headers_off: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h) + else: + num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h_d_ordered) + elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) + num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) else: - boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) + num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) - if self.plotter: - self.plotter.write_images_into_directory(polygons_of_images, image_page) + # print(peaks_neg_fin,peaks_neg_fin_d,'num_col2') + # print(spliter_y_new,spliter_y_new_d,'num_col_classifier') + # print(matrix_of_lines_ch.shape,matrix_of_lines_ch_d.shape,'matrix_of_lines_ch') + + if num_col_classifier >= 3: - if self.full_layout: if np.abs(slope_deskew) < 
SLOPE_THRESHOLD: - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) + regions_without_seperators = regions_without_seperators.astype(np.uint8) + regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) + random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) + random_pixels_for_image[random_pixels_for_image < -0.5] = 0 + random_pixels_for_image[random_pixels_for_image != 0] = 1 + regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 5)] = 1 else: - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) + regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) + regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) + random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) + random_pixels_for_image[random_pixels_for_image < -0.5] = 0 + random_pixels_for_image[random_pixels_for_image != 0] = 1 + regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 5)] = 1 + + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) + else: + boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) + + if self.plotter: + self.plotter.write_images_into_directory(polygons_of_images, image_page) + + if self.full_layout: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, 
contours_only_text_parent_h, boxes, textline_mask_tot) + else: + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - self.write_into_page_xml_full(contours_only_text_parent, contours_only_text_parent_h, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals) + self.write_into_page_xml_full(contours_only_text_parent, contours_only_text_parent_h, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals) + else: + contours_only_text_parent_h = None + # self.logger.debug('bura galmir?') + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + #contours_only_text_parent = list(np.array(contours_only_text_parent)[index_by_text_par_con]) + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: - contours_only_text_parent_h = None - # self.logger.debug('bura galmir?') - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - #contours_only_text_parent = list(np.array(contours_only_text_parent)[index_by_text_par_con]) - order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) - else: - contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) - order_text_new, id_of_texts_tot = 
self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - # order_text_new , id_of_texts_tot=self.do_order_of_regions(contours_only_text_parent,contours_only_text_parent_h,boxes,textline_mask_tot) - self.write_into_page_xml(txt_con_org, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals) + contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) + order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) + # order_text_new , id_of_texts_tot=self.do_order_of_regions(contours_only_text_parent,contours_only_text_parent_h,boxes,textline_mask_tot) + self.write_into_page_xml(txt_con_org, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals) self.logger.info("Job done in %ss", str(time.time() - t1)) From 8f82e81551953f5cdd89268a6b4472dcdb20f7bc Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 5 Feb 2021 17:35:29 +0100 Subject: [PATCH 37/89] remove unnecessary patches assignment, simplify if-else --- sbb_newspapers_org_image/eynollah.py | 224 ++++++++++++--------------- 1 file changed, 102 insertions(+), 122 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 5bf67e8..4b2e5bc 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -499,7 +499,26 @@ class eynollah: img_width_model = model.layers[len(model.layers) - 1].output_shape[2] n_classes = 
model.layers[len(model.layers) - 1].output_shape[3] - if patches: + + if not patches: + img_h_page = img.shape[0] + img_w_page = img.shape[1] + img = img / float(255.0) + img = resize_image(img, img_height_model, img_width_model) + + label_p_pred = model.predict(img.reshape(1, img.shape[0], img.shape[1], img.shape[2])) + + seg = np.argmax(label_p_pred, axis=3)[0] + seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) + prediction_true = resize_image(seg_color, img_h_page, img_w_page) + prediction_true = prediction_true.astype(np.uint8) + + del img + del seg_color + del label_p_pred + del seg + + else: if img.shape[0] < img_height_model: img = resize_image(img, img_height_model, img.shape[1]) @@ -599,39 +618,18 @@ class eynollah: del seg_color del seg del img_patch - - if not patches: - img_h_page = img.shape[0] - img_w_page = img.shape[1] - img = img / float(255.0) - img = resize_image(img, img_height_model, img_width_model) - - label_p_pred = model.predict(img.reshape(1, img.shape[0], img.shape[1], img.shape[2])) - - seg = np.argmax(label_p_pred, axis=3)[0] - seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) - prediction_true = resize_image(seg_color, img_h_page, img_w_page) - prediction_true = prediction_true.astype(np.uint8) - - del img - del seg_color - del label_p_pred - del seg - del model gc.collect() - return prediction_true def early_page_for_num_of_column_classification(self): self.logger.debug("enter early_page_for_num_of_column_classification") img = cv2.imread(self.image_filename) img = img.astype(np.uint8) - patches = False model_page, session_page = self.start_new_session_and_model(self.model_page_dir) for ii in range(1): img = cv2.GaussianBlur(img, (5, 5), 0) - img_page_prediction = self.do_prediction(patches, img, model_page) + img_page_prediction = self.do_prediction(False, img, model_page) imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) _, thresh = cv2.threshold(imgray, 0, 255, 0) @@ -664,12 +662,11 @@ class eynollah: def 
extract_page(self): self.logger.debug("enter extract_page") - patches = False model_page, session_page = self.start_new_session_and_model(self.model_page_dir) for ii in range(1): img = cv2.GaussianBlur(self.image, (5, 5), 0) - img_page_prediction = self.do_prediction(patches, img, model_page) + img_page_prediction = self.do_prediction(False, img, model_page) imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) _, thresh = cv2.threshold(imgray, 0, 255, 0) @@ -715,92 +712,88 @@ class eynollah: img_height_h = img.shape[0] img_width_h = img.shape[1] - if patches: - model_region, session_region = self.start_new_session_and_model(self.model_region_dir_fully) - if not patches: - model_region, session_region = self.start_new_session_and_model(self.model_region_dir_fully_np) - - if patches and cols == 1: - img2 = otsu_copy_binary(img) - img2 = img2.astype(np.uint8) - img2 = resize_image(img2, int(img_height_h * 0.7), int(img_width_h * 0.7)) - marginal_of_patch_percent = 0.1 - prediction_regions2 = self.do_prediction(patches, img2, model_region, marginal_of_patch_percent) - prediction_regions2 = resize_image(prediction_regions2, img_height_h, img_width_h) - - if patches and cols == 2: - img2 = otsu_copy_binary(img) - img2 = img2.astype(np.uint8) - img2 = resize_image(img2, int(img_height_h * 0.4), int(img_width_h * 0.4)) - marginal_of_patch_percent = 0.1 - prediction_regions2 = self.do_prediction(patches, img2, model_region, marginal_of_patch_percent) - prediction_regions2 = resize_image(prediction_regions2, img_height_h, img_width_h) - - elif patches and cols > 2: - img2 = otsu_copy_binary(img) - img2 = img2.astype(np.uint8) - img2 = resize_image(img2, int(img_height_h * 0.3), int(img_width_h * 0.3)) - marginal_of_patch_percent = 0.1 - prediction_regions2 = self.do_prediction(patches, img2, model_region, marginal_of_patch_percent) - prediction_regions2 = resize_image(prediction_regions2, img_height_h, img_width_h) - - if patches and cols == 2: - img = 
otsu_copy_binary(img) - img = img.astype(np.uint8) - if img_width_h >= 2000: - img = resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9)) - img = img.astype(np.uint8) + model_region, session_region = self.start_new_session_and_model(self.model_region_dir_fully if patches else self.model_region_dir_fully_np) - if patches and cols == 1: + if not patches: img = otsu_copy_binary(img) img = img.astype(np.uint8) - img = resize_image(img, int(img_height_h * 0.5), int(img_width_h * 0.5)) - img = img.astype(np.uint8) - - if patches and cols == 3: - if (self.scale_x == 1 and img_width_h > 3000) or (self.scale_x != 1 and img_width_h > 2800): - img = otsu_copy_binary(img) - img = img.astype(np.uint8) - img = resize_image(img, int(img_height_h * 2800 / float(img_width_h)), 2800) - else: - img = otsu_copy_binary(img) - img = img.astype(np.uint8) - - if patches and cols == 4: - #print(self.scale_x,img_width_h,'scale') - if (self.scale_x == 1 and img_width_h > 4000) or (self.scale_x != 1 and img_width_h > 3700): - img = otsu_copy_binary(img) - img = img.astype(np.uint8) - img= resize_image(img, int(img_height_h * 3700 / float(img_width_h)), 3700) - else: - img = otsu_copy_binary(img)#self.otsu_copy(img) - img = img.astype(np.uint8) - img= resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9)) - - if patches and cols==5: - if self.scale_x == 1 and img_width_h > 5000: + prediction_regions2 = None + else: + if cols == 1: + img2 = otsu_copy_binary(img) + img2 = img2.astype(np.uint8) + img2 = resize_image(img2, int(img_height_h * 0.7), int(img_width_h * 0.7)) + marginal_of_patch_percent = 0.1 + prediction_regions2 = self.do_prediction(patches, img2, model_region, marginal_of_patch_percent) + prediction_regions2 = resize_image(prediction_regions2, img_height_h, img_width_h) + + if cols == 2: + img2 = otsu_copy_binary(img) + img2 = img2.astype(np.uint8) + img2 = resize_image(img2, int(img_height_h * 0.4), int(img_width_h * 0.4)) + marginal_of_patch_percent = 0.1 
+ prediction_regions2 = self.do_prediction(patches, img2, model_region, marginal_of_patch_percent) + prediction_regions2 = resize_image(prediction_regions2, img_height_h, img_width_h) + + elif cols > 2: + img2 = otsu_copy_binary(img) + img2 = img2.astype(np.uint8) + img2 = resize_image(img2, int(img_height_h * 0.3), int(img_width_h * 0.3)) + marginal_of_patch_percent = 0.1 + prediction_regions2 = self.do_prediction(patches, img2, model_region, marginal_of_patch_percent) + prediction_regions2 = resize_image(prediction_regions2, img_height_h, img_width_h) + + if cols == 2: img = otsu_copy_binary(img) img = img.astype(np.uint8) - img= resize_image(img, int(img_height_h * 0.7), int(img_width_h * 0.7)) - else: - img = otsu_copy_binary(img) + if img_width_h >= 2000: + img = resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9)) img = img.astype(np.uint8) - img= resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9) ) - if patches and cols>=6: - if img_width_h > 5600: + if cols == 1: img = otsu_copy_binary(img) img = img.astype(np.uint8) - img= resize_image(img, int(img_height_h * 5600 / float(img_width_h)), 5600) - else: - img = otsu_copy_binary(img) + img = resize_image(img, int(img_height_h * 0.5), int(img_width_h * 0.5)) img = img.astype(np.uint8) - img= resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9)) - if not patches: - img = otsu_copy_binary(img) - img = img.astype(np.uint8) - prediction_regions2 = None + if cols == 3: + if (self.scale_x == 1 and img_width_h > 3000) or (self.scale_x != 1 and img_width_h > 2800): + img = otsu_copy_binary(img) + img = img.astype(np.uint8) + img = resize_image(img, int(img_height_h * 2800 / float(img_width_h)), 2800) + else: + img = otsu_copy_binary(img) + img = img.astype(np.uint8) + + if cols == 4: + if (self.scale_x == 1 and img_width_h > 4000) or (self.scale_x != 1 and img_width_h > 3700): + img = otsu_copy_binary(img) + img = img.astype(np.uint8) + img= resize_image(img, int(img_height_h * 
3700 / float(img_width_h)), 3700) + else: + img = otsu_copy_binary(img)#self.otsu_copy(img) + img = img.astype(np.uint8) + img= resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9)) + + if cols == 5: + if self.scale_x == 1 and img_width_h > 5000: + img = otsu_copy_binary(img) + img = img.astype(np.uint8) + img= resize_image(img, int(img_height_h * 0.7), int(img_width_h * 0.7)) + else: + img = otsu_copy_binary(img) + img = img.astype(np.uint8) + img= resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9) ) + + if cols >= 6: + if img_width_h > 5600: + img = otsu_copy_binary(img) + img = img.astype(np.uint8) + img= resize_image(img, int(img_height_h * 5600 / float(img_width_h)), 5600) + else: + img = otsu_copy_binary(img) + img = img.astype(np.uint8) + img= resize_image(img, int(img_height_h * 0.9), int(img_width_h * 0.9)) marginal_of_patch_percent = 0.1 prediction_regions = self.do_prediction(patches, img, model_region, marginal_of_patch_percent) @@ -1105,10 +1098,7 @@ class eynollah: def textline_contours(self, img, patches, scaler_h, scaler_w): self.logger.debug('enter textline_contours') - if patches: - model_textline, session_textline = self.start_new_session_and_model(self.model_textline_dir) - if not patches: - model_textline, session_textline = self.start_new_session_and_model(self.model_textline_dir_np) + model_textline, session_textline = self.start_new_session_and_model(self.model_textline_dir if patches else self.model_textline_dir_np) img = img.astype(np.uint8) img_org = np.copy(img) img_h = img_org.shape[0] @@ -1116,17 +1106,12 @@ class eynollah: img = resize_image(img_org, int(img_org.shape[0] * scaler_h), int(img_org.shape[1] * scaler_w)) prediction_textline = self.do_prediction(patches, img, model_textline) prediction_textline = resize_image(prediction_textline, img_h, img_w) - patches = False - prediction_textline_longshot = self.do_prediction(patches, img, model_textline) + prediction_textline_longshot = 
self.do_prediction(False, img, model_textline) prediction_textline_longshot_true_size = resize_image(prediction_textline_longshot, img_h, img_w) - - # prediction_textline_streched=self.do_prediction(patches,img,model_textline) - # prediction_textline_streched= resize_image(prediction_textline_streched, img_h, img_w) ##plt.imshow(prediction_textline_streched[:,:,0]) ##plt.show() session_textline.close() - del model_textline del session_textline del img @@ -1697,7 +1682,6 @@ class eynollah: model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_ens) gaussian_filter=False - patches=True binary=False ratio_y=1.3 ratio_x=1 @@ -1714,7 +1698,7 @@ class eynollah: img= cv2.GaussianBlur(img,(5,5),0) img = img.astype(np.uint16) - prediction_regions_org_y = self.do_prediction(patches,img,model_region) + prediction_regions_org_y = self.do_prediction(True, img, model_region) prediction_regions_org_y = resize_image(prediction_regions_org_y, img_height_h, img_width_h ) #plt.imshow(prediction_regions_org_y[:,:,0]) @@ -1740,7 +1724,7 @@ class eynollah: img = cv2.GaussianBlur(img, (5,5 ), 0) img = img.astype(np.uint16) - prediction_regions_org = self.do_prediction(patches,img,model_region) + prediction_regions_org = self.do_prediction(True, img, model_region) prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) ##plt.imshow(prediction_regions_org[:,:,0]) @@ -1757,7 +1741,6 @@ class eynollah: model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p2) gaussian_filter=False - patches=True binary=False ratio_x=1 ratio_y=1 @@ -1776,7 +1759,7 @@ class eynollah: img = img.astype(np.uint16) marginal_patch=0.2 - prediction_regions_org2=self.do_prediction(patches,img,model_region,marginal_patch) + prediction_regions_org2=self.do_prediction(True, img, model_region, marginal_patch) prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h ) @@ -2224,16 +2207,15 @@ 
class eynollah: num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines = self.run_graphics_and_columns(text_regions_p_1, num_column_is_classified) self.logger.info("Graphics detection took %ss ", str(time.time() - t1)) - #print(num_col, "num_colnum_col") if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") self.write_into_page_xml([], page_coord, self.dir_out, [], [], [], [], [], [], [], [], self.curved_line, [], []) self.logger.info("Job done in %ss", str(time.time() - t1)) return - patches = True + scaler_h_textline = 1 # 1.2#1.2 scaler_w_textline = 1 # 0.9#1 - textline_mask_tot_ea, textline_mask_tot_long_shot = self.textline_contours(image_page, patches, scaler_h_textline, scaler_w_textline) + textline_mask_tot_ea, textline_mask_tot_long_shot = self.textline_contours(image_page, True, scaler_h_textline, scaler_w_textline) K.clear_session() gc.collect() @@ -2354,11 +2336,10 @@ class eynollah: K.clear_session() # gc.collect() - patches = True image_page = image_page.astype(np.uint8) # print(type(image_page)) - regions_fully, regions_fully_only_drop = self.extract_text_regions(image_page, patches, cols=num_col_classifier) + regions_fully, regions_fully_only_drop = self.extract_text_regions(image_page, True, cols=num_col_classifier) text_regions_p[:,:][regions_fully[:,:,0]==6]=6 regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) @@ -2376,8 +2357,7 @@ class eynollah: K.clear_session() gc.collect() - patches = False - regions_fully_np, _ = self.extract_text_regions(image_page, patches, cols=num_col_classifier) + regions_fully_np, _ = self.extract_text_regions(image_page, False, cols=num_col_classifier) # plt.imshow(regions_fully_np[:,:,0]) # plt.show() From b06acbf754077a8a294c2c0270dd6c3f6fa28528 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 5 Feb 2021 18:12:20 +0100 Subject: [PATCH 38/89] untangle run even further --- 
sbb_newspapers_org_image/eynollah.py | 91 +++++++++++++++++----------- 1 file changed, 54 insertions(+), 37 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 4b2e5bc..1723730 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -2187,32 +2187,7 @@ class eynollah: self.get_image_and_scales_after_enhancing(img_org, img_res) return img_res, is_image_enhanced, num_column_is_classified - - def run(self): - """ - Get image and scales, then extract the page of scanned image - """ - self.logger.debug("enter run") - is_image_enhanced = False - - t1 = time.time() - img_res, is_image_enhanced, num_column_is_classified = self.run_enhancement() - self.logger.info("Enhancing took %ss ", str(time.time() - t1)) - - t1 = time.time() - text_regions_p_1 = self.get_regions_from_xy_2models(img_res, is_image_enhanced) - self.logger.info("Textregion detection took %ss ", str(time.time() - t1)) - - t1 = time.time() - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines = self.run_graphics_and_columns(text_regions_p_1, num_column_is_classified) - self.logger.info("Graphics detection took %ss ", str(time.time() - t1)) - - if not num_col: - self.logger.info("No columns detected, outputting an empty PAGE-XML") - self.write_into_page_xml([], page_coord, self.dir_out, [], [], [], [], [], [], [], [], self.curved_line, [], []) - self.logger.info("Job done in %ss", str(time.time() - t1)) - return - + def run_textline(self, image_page): scaler_h_textline = 1 # 1.2#1.2 scaler_w_textline = 1 # 0.9#1 textline_mask_tot_ea, textline_mask_tot_long_shot = self.textline_contours(image_page, True, scaler_h_textline, scaler_w_textline) @@ -2220,28 +2195,33 @@ class eynollah: K.clear_session() gc.collect() #print(np.unique(textline_mask_tot_ea[:, :]), "textline") - if self.plotter: - self.plotter.save_plot_of_textlines(textline_mask_tot_ea, image_page) - 
self.logger.info("textline detection took %ss", str(time.time() - t1)) - t1 = time.time() # plt.imshow(textline_mask_tot_ea) # plt.show() + if self.plotter: + self.plotter.save_plot_of_textlines(textline_mask_tot_ea, image_page) + return textline_mask_tot_ea, textline_mask_tot_long_shot + def run_deskew(self, textline_mask_tot_ea): sigma = 2 main_page_deskew = True slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2), sigma, main_page_deskew, plotter=self.plotter) - slope_first = 0 # return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2),sigma, plotter=self.plotter) + slope_first = 0 if self.plotter: self.plotter.save_deskewed_image(slope_deskew) self.logger.info("slope_deskew: %s", slope_deskew) + return slope_deskew, slope_first - ##plt.imshow(img_rotated) - ##plt.show() - - self.logger.info("deskewing: " + str(time.time() - t1)) - t1 = time.time() - + def run_marginals( + self, + image_page, + textline_mask_tot_ea, + mask_images, + mask_lines, + num_col_classifier, + slope_deskew, + text_regions_p_1 + ): image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] textline_mask_tot[mask_images[:, :] == 1] = 0 @@ -2267,7 +2247,44 @@ class eynollah: if self.plotter: self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) self.plotter.save_plot_of_layout_main(text_regions_p, image_page) + return textline_mask_tot, text_regions_p, image_page_rotated + + def run(self): + """ + Get image and scales, then extract the page of scanned image + """ + self.logger.debug("enter run") + is_image_enhanced = False + + t1 = time.time() + img_res, is_image_enhanced, num_column_is_classified = self.run_enhancement() + self.logger.info("Enhancing took %ss ", str(time.time() - t1)) + + t1 = time.time() + text_regions_p_1 = self.get_regions_from_xy_2models(img_res, is_image_enhanced) + self.logger.info("Textregion detection took %ss ", str(time.time() - t1)) + + t1 = time.time() + 
num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines = \ + self.run_graphics_and_columns(text_regions_p_1, num_column_is_classified) + self.logger.info("Graphics detection took %ss ", str(time.time() - t1)) + + if not num_col: + self.logger.info("No columns detected, outputting an empty PAGE-XML") + self.write_into_page_xml([], page_coord, self.dir_out, [], [], [], [], [], [], [], [], self.curved_line, [], []) + self.logger.info("Job done in %ss", str(time.time() - t1)) + return + + t1 = time.time() + textline_mask_tot_ea, textline_mask_tot_long_shot = self.run_textline(image_page) + self.logger.info("textline detection took %ss", str(time.time() - t1)) + + t1 = time.time() + slope_deskew, slope_first = self.run_deskew(textline_mask_tot_ea) + self.logger.info("deskewing took %ss", str(time.time() - t1)) + t1 = time.time() + textline_mask_tot, text_regions_p, image_page_rotated = self.run_marginals(image_page, textline_mask_tot_ea, mask_images, mask_lines, num_col_classifier, slope_deskew, text_regions_p_1) self.logger.info("detection of marginals took %ss", str(time.time() - t1)) t1 = time.time() From 420a9ca252b3b19178820fa7d2ab78d9c6547cc2 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 5 Feb 2021 18:45:14 +0100 Subject: [PATCH 39/89] more outfactoring run code to methods --- sbb_newspapers_org_image/eynollah.py | 272 +++++++++++++-------------- 1 file changed, 134 insertions(+), 138 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 1723730..be22618 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -2212,16 +2212,7 @@ class eynollah: self.logger.info("slope_deskew: %s", slope_deskew) return slope_deskew, slope_first - def run_marginals( - self, - image_page, - textline_mask_tot_ea, - mask_images, - mask_lines, - num_col_classifier, - slope_deskew, - text_regions_p_1 - ): + def run_marginals(self, 
image_page, textline_mask_tot_ea, mask_images, mask_lines, num_col_classifier, slope_deskew, text_regions_p_1): image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] textline_mask_tot[mask_images[:, :] == 1] = 0 @@ -2249,12 +2240,142 @@ class eynollah: self.plotter.save_plot_of_layout_main(text_regions_p, image_page) return textline_mask_tot, text_regions_p, image_page_rotated + def run_boxes_no_full_layout(self, image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier): + self.logger.debug('enter run_boxes_no_full_layout') + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n = rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, slope_deskew) + text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) + regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 + regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) + + pixel_lines = 3 + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + K.clear_session() + gc.collect() + + self.logger.info("num_col_classifier: %s", num_col_classifier) + + if num_col_classifier >= 3: + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + 
regions_without_seperators = regions_without_seperators.astype(np.uint8) + regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) + #random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) + #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 + #random_pixels_for_image[random_pixels_for_image != 0] = 1 + #regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 2)] = 1 + else: + regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) + regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) + #random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) + #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 + #random_pixels_for_image[random_pixels_for_image != 0] = 1 + + #regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 2)] = 1 + + t1 = time.time() + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) + else: + boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) + self.logger.debug("len(boxes): %s", len(boxes)) + self.logger.info("detecting boxes took %ss", str(time.time() - t1)) + img_revised_tab = text_regions_p[:, :] + polygons_of_images = return_contours_of_interested_region(img_revised_tab, 2) + + # plt.imshow(img_revised_tab) + # plt.show() + K.clear_session() + self.logger.debug('exit run_boxes_no_full_layout') + return polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d + + def run_boxes_full_layout(self, image_page, textline_mask_tot, text_regions_p, 
slope_deskew, num_col_classifier, img_only_regions): + self.logger.debug('enter run_boxes_full_layout') + # set first model with second model + text_regions_p[:, :][text_regions_p[:, :] == 2] = 5 + text_regions_p[:, :][text_regions_p[:, :] == 3] = 6 + text_regions_p[:, :][text_regions_p[:, :] == 4] = 8 + + K.clear_session() + # gc.collect() + image_page = image_page.astype(np.uint8) + + # print(type(image_page)) + regions_fully, regions_fully_only_drop = self.extract_text_regions(image_page, True, cols=num_col_classifier) + text_regions_p[:,:][regions_fully[:,:,0]==6]=6 + + regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) + regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 4] = 4 + K.clear_session() + gc.collect() + + # plt.imshow(regions_fully[:,:,0]) + # plt.show() + + regions_fully = putt_bb_of_drop_capitals_of_model_in_patches_in_layout(regions_fully) + + # plt.imshow(regions_fully[:,:,0]) + # plt.show() + + K.clear_session() + gc.collect() + regions_fully_np, _ = self.extract_text_regions(image_page, False, cols=num_col_classifier) + + # plt.imshow(regions_fully_np[:,:,0]) + # plt.show() + + if num_col_classifier > 2: + regions_fully_np[:, :, 0][regions_fully_np[:, :, 0] == 4] = 0 + else: + regions_fully_np = filter_small_drop_capitals_from_no_patch_layout(regions_fully_np, text_regions_p) + + # plt.imshow(regions_fully_np[:,:,0]) + # plt.show() + + K.clear_session() + gc.collect() + + # plt.imshow(regions_fully[:,:,0]) + # plt.show() + + regions_fully = boosting_headers_by_longshot_region_segmentation(regions_fully, regions_fully_np, img_only_regions) + + # plt.imshow(regions_fully[:,:,0]) + # plt.show() + + text_regions_p[:, :][regions_fully[:, :, 0] == 4] = 4 + text_regions_p[:, :][regions_fully_np[:, :, 0] == 4] = 4 + + #plt.imshow(text_regions_p) + #plt.show() + + if np.abs(slope_deskew) >= SLOPE_THRESHOLD: + image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = 
rotation_not_90_func_full_layout(image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) + + text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) + textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) + regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) + regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 + + regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) + + K.clear_session() + gc.collect() + img_revised_tab = np.copy(text_regions_p[:, :]) + pixel_img = 5 + polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) + self.logger.debug('exit run_boxes_full_layout') + return polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, regions_fully + def run(self): """ Get image and scales, then extract the page of scanned image """ self.logger.debug("enter run") - is_image_enhanced = False t1 = time.time() img_res, is_image_enhanced, num_column_is_classified = self.run_enhancement() @@ -2289,139 +2410,14 @@ class eynollah: t1 = time.time() if not self.full_layout: - - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n = rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, slope_deskew) - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 - regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 
#self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) - - pixel_lines = 3 - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) - - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) - K.clear_session() - gc.collect() - - self.logger.info("num_col_classifier: %s", num_col_classifier) - - if num_col_classifier >= 3: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - regions_without_seperators = regions_without_seperators.astype(np.uint8) - regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) - #random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) - #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - #random_pixels_for_image[random_pixels_for_image != 0] = 1 - #regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 2)] = 1 - else: - regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) - regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) - #random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) - #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - #random_pixels_for_image[random_pixels_for_image != 0] = 1 - - #regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 2)] = 1 - - if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, 
regions_without_seperators, matrix_of_lines_ch, num_col_classifier) - else: - boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) - - self.logger.debug("len(boxes): %s", len(boxes)) - self.logger.info("detecting boxes took %ss", str(time.time() - t1)) - t1 = time.time() - img_revised_tab = text_regions_p[:, :] - pixel_img = 2 - polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) - - # plt.imshow(img_revised_tab) - # plt.show() - K.clear_session() + polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d = self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier) pixel_img = 4 min_area_mar = 0.00001 polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) if self.full_layout: - # set first model with second model - text_regions_p[:, :][text_regions_p[:, :] == 2] = 5 - text_regions_p[:, :][text_regions_p[:, :] == 3] = 6 - text_regions_p[:, :][text_regions_p[:, :] == 4] = 8 - - K.clear_session() - # gc.collect() - image_page = image_page.astype(np.uint8) - - # print(type(image_page)) - regions_fully, regions_fully_only_drop = self.extract_text_regions(image_page, True, cols=num_col_classifier) - text_regions_p[:,:][regions_fully[:,:,0]==6]=6 - - regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) - regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 4] = 4 - K.clear_session() - gc.collect() - - # plt.imshow(regions_fully[:,:,0]) - # plt.show() - - regions_fully = putt_bb_of_drop_capitals_of_model_in_patches_in_layout(regions_fully) - - # plt.imshow(regions_fully[:,:,0]) - # plt.show() - - K.clear_session() - gc.collect() - regions_fully_np, _ = self.extract_text_regions(image_page, False, cols=num_col_classifier) - - # 
plt.imshow(regions_fully_np[:,:,0]) - # plt.show() - - if num_col_classifier > 2: - regions_fully_np[:, :, 0][regions_fully_np[:, :, 0] == 4] = 0 - else: - regions_fully_np = filter_small_drop_capitals_from_no_patch_layout(regions_fully_np, text_regions_p) - - # plt.imshow(regions_fully_np[:,:,0]) - # plt.show() - - K.clear_session() - gc.collect() - - # plt.imshow(regions_fully[:,:,0]) - # plt.show() - - regions_fully = boosting_headers_by_longshot_region_segmentation(regions_fully, regions_fully_np, img_only_regions) - - # plt.imshow(regions_fully[:,:,0]) - # plt.show() - - text_regions_p[:, :][regions_fully[:, :, 0] == 4] = 4 - text_regions_p[:, :][regions_fully_np[:, :, 0] == 4] = 4 - - #plt.imshow(text_regions_p) - #plt.show() - - if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout(image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) - - text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) - textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) - regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 - - regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) - - K.clear_session() - gc.collect() - img_revised_tab = np.copy(text_regions_p[:, :]) - self.logger.info("detection of full layout took %ss", str(time.time() - t1)) - t1 = time.time() - pixel_img = 5 - polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) - + polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, regions_fully = 
self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, img_only_regions) # plt.imshow(img_revised_tab) # plt.show() From ef3ccd0d5f4bae25774309a50f6b114cde08c99c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 5 Feb 2021 18:59:29 +0100 Subject: [PATCH 40/89] fix special case where num_col_classifier was unbound --- sbb_newspapers_org_image/eynollah.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 0bf46ed..10b9e50 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -2154,6 +2154,7 @@ class eynollah: img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) img_only_regions = cv2.erode(img_only_regions_with_sep[:, :], self.kernel, iterations=6) + num_col_classifier = None try: num_col, peaks_neg_fin = find_num_col(img_only_regions, multiplier=6.0) if not num_column_is_classified: @@ -2161,7 +2162,6 @@ class eynollah: except: num_col = None peaks_neg_fin = [] - num_col_classifier = None return num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines def run_enhancement(self): From 0a92543245aa8bef62bfaa83f17cd031d0070f38 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 16 Feb 2021 00:31:44 +0100 Subject: [PATCH 41/89] functionality checked --- sbb_newspapers_org_image/eynollah.py | 53 +++++++++++++++++++--------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 10b9e50..66eb8d8 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -2125,7 +2125,7 @@ class eynollah: return self.do_order_of_regions_full_layout(*args, **kwargs) return self.do_order_of_regions_no_full_layout(*args, **kwargs) - def run_graphics_and_columns(self, text_regions_p_1, 
num_column_is_classified): + def run_graphics_and_columns(self, text_regions_p_1, num_col_classifier, num_column_is_classified): img_g = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) img_g = img_g.astype(np.uint8) @@ -2154,19 +2154,20 @@ class eynollah: img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) img_only_regions = cv2.erode(img_only_regions_with_sep[:, :], self.kernel, iterations=6) - num_col_classifier = None + try: num_col, peaks_neg_fin = find_num_col(img_only_regions, multiplier=6.0) + if not num_column_is_classified: num_col_classifier = num_col + 1 except: num_col = None peaks_neg_fin = [] - return num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines + return num_col+1, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 def run_enhancement(self): self.logger.info("resize and enhance image") - is_image_enhanced, img_org, img_res, _, num_column_is_classified = self.resize_and_enhance_image_with_column_classifier() + is_image_enhanced, img_org, img_res, num_col_classifier, num_column_is_classified = self.resize_and_enhance_image_with_column_classifier() self.logger.info("Image is %senhanced", '' if is_image_enhanced else 'not ') K.clear_session() scale = 1 @@ -2185,7 +2186,7 @@ class eynollah: if self.allow_scaling: img_org, img_res, is_image_enhanced = self.resize_image_with_column_classifier(is_image_enhanced) self.get_image_and_scales_after_enhancing(img_org, img_res) - return img_res, is_image_enhanced, num_column_is_classified + return img_res, is_image_enhanced, num_col_classifier, num_column_is_classified def run_textline(self, image_page): scaler_h_textline = 1 # 1.2#1.2 @@ -2215,7 +2216,7 @@ class eynollah: def run_marginals(self, image_page, textline_mask_tot_ea, mask_images, mask_lines, num_col_classifier, slope_deskew, text_regions_p_1): image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] 
textline_mask_tot[mask_images[:, :] == 1] = 0 - + pixel_img = 1 min_area = 0.00001 max_area = 0.0006 @@ -2228,6 +2229,8 @@ class eynollah: try: regions_without_seperators = (text_regions_p[:, :] == 1) * 1 regions_without_seperators = regions_without_seperators.astype(np.uint8) + + text_regions_p = get_marginals(rotate_image(regions_without_seperators, slope_deskew), text_regions_p, num_col_classifier, slope_deskew, kernel=self.kernel) except: pass @@ -2248,7 +2251,13 @@ class eynollah: textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) - + + + if np.abs(slope_deskew) < SLOPE_THRESHOLD: + text_regions_p_1_n = None + textline_mask_tot_d = None + regions_without_seperators_d = None + pixel_lines = 3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) @@ -2280,9 +2289,13 @@ class eynollah: t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) + boxes_d = None + self.logger.debug("len(boxes): %s", len(boxes)) else: boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) - self.logger.debug("len(boxes): %s", len(boxes)) + boxes = None + self.logger.debug("len(boxes): %s", len(boxes_d)) + self.logger.info("detecting boxes took %ss", str(time.time() - t1)) img_revised_tab = text_regions_p[:, :] polygons_of_images = 
return_contours_of_interested_region(img_revised_tab, 2) @@ -2291,7 +2304,7 @@ class eynollah: # plt.show() K.clear_session() self.logger.debug('exit run_boxes_no_full_layout') - return polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d + return polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, boxes, boxes_d def run_boxes_full_layout(self, image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, img_only_regions): self.logger.debug('enter run_boxes_full_layout') @@ -2360,6 +2373,13 @@ class eynollah: textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 + + else: + text_regions_p_1_n = None + textline_mask_tot_d = None + regions_without_seperators_d = None + + regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) @@ -2369,7 +2389,7 @@ class eynollah: pixel_img = 5 polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) self.logger.debug('exit run_boxes_full_layout') - return polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, regions_fully + return polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, regions_fully, regions_without_seperators def run(self): """ @@ -2378,7 +2398,7 @@ class eynollah: self.logger.debug("enter run") t1 = time.time() - img_res, is_image_enhanced, num_column_is_classified = self.run_enhancement() + img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement() 
self.logger.info("Enhancing took %ss ", str(time.time() - t1)) t1 = time.time() @@ -2386,10 +2406,11 @@ class eynollah: self.logger.info("Textregion detection took %ss ", str(time.time() - t1)) t1 = time.time() - num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines = \ - self.run_graphics_and_columns(text_regions_p_1, num_column_is_classified) + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 = \ + self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified) self.logger.info("Graphics detection took %ss ", str(time.time() - t1)) - + + if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") self.write_into_page_xml([], page_coord, self.dir_out, [], [], [], [], [], [], [], [], self.curved_line, [], []) @@ -2410,14 +2431,14 @@ class eynollah: t1 = time.time() if not self.full_layout: - polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d = self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier) + polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, boxes, boxes_d = self.run_boxes_no_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier) pixel_img = 4 min_area_mar = 0.00001 polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) if self.full_layout: - polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, regions_fully = self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, img_only_regions) + polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, regions_fully, regions_without_seperators = 
self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, img_only_regions) # plt.imshow(img_revised_tab) # plt.show() From 8c603ae16d1074ec247c9956134cfc4f2b75481f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 18 Feb 2021 14:27:08 +0100 Subject: [PATCH 42/89] check_dpi: use OcrdExif instead identify callout --- sbb_newspapers_org_image/eynollah.py | 20 ++++--------------- sbb_newspapers_org_image/utils/pil_cv2.py | 24 +++++++++++++++++++++++ tests/test_dpi.py | 10 ++++++++++ 3 files changed, 38 insertions(+), 16 deletions(-) create mode 100644 sbb_newspapers_org_image/utils/pil_cv2.py create mode 100644 tests/test_dpi.py diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 66eb8d8..a118d3b 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -12,8 +12,9 @@ import time import warnings from pathlib import Path from multiprocessing import Process, Queue, cpu_count -from ocrd_utils import getLogger +from lxml import etree as ET +from ocrd_utils import getLogger import cv2 import numpy as np @@ -27,14 +28,6 @@ import tensorflow as tf tf.get_logger().setLevel("ERROR") warnings.filterwarnings("ignore") -from scipy.signal import find_peaks -from scipy.ndimage import gaussian_filter1d -from shapely import geometry -from lxml import etree as ET -from matplotlib import pyplot, transforms -import matplotlib.patches as mpatches -import imutils - from .utils.contour import ( contours_in_same_horizon, filter_contours_area_of_image_interiors, @@ -115,7 +108,7 @@ from .utils import ( ) from .utils.xml import create_page_xml - +from .utils.pil_cv2 import check_dpi from .plot import EynollahPlotter SLOPE_THRESHOLD = 0.13 @@ -275,11 +268,6 @@ class eynollah: return prediction_true - def check_dpi(self): - self.logger.debug("enter check_dpi") - dpi = os.popen('identify -format "%x " ' + self.image_filename).read() - return 
int(float(dpi)) - def calculate_width_height_by_columns(self, img, num_col, width_early, label_p_pred): self.logger.debug("enter calculate_width_height_by_columns") if num_col == 1 and width_early < 1100: @@ -389,7 +377,7 @@ class eynollah: def resize_and_enhance_image_with_column_classifier(self): self.logger.debug("enter resize_and_enhance_image_with_column_classifier") - dpi = self.check_dpi() + dpi = check_dpi(self.image_filename) self.logger.info("Detected %s DPI" % dpi) img = cv2.imread(self.image_filename) diff --git a/sbb_newspapers_org_image/utils/pil_cv2.py b/sbb_newspapers_org_image/utils/pil_cv2.py new file mode 100644 index 0000000..d7cd18d --- /dev/null +++ b/sbb_newspapers_org_image/utils/pil_cv2.py @@ -0,0 +1,24 @@ +from PIL import Image +import numpy as np +from ocrd_models import OcrdExif +from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor, imread + +# from sbb_binarization + +def cv2pil(img): + return Image.fromarray(img.astype('uint8')) + +def pil2cv(img): + # from ocrd/workspace.py + color_conversion = COLOR_GRAY2BGR if img.mode in ('1', 'L') else COLOR_RGB2BGR + pil_as_np_array = np.array(img).astype('uint8') if img.mode == '1' else np.array(img) + return cvtColor(pil_as_np_array, color_conversion) + +def check_dpi(image_filename): + exif = OcrdExif(Image.open(image_filename)) + print(exif.to_xml()) + resolution = exif.resolution + if exif.resolutionUnit == 'cm': + resolution /= 2.54 + return int(resolution) + diff --git a/tests/test_dpi.py b/tests/test_dpi.py new file mode 100644 index 0000000..59c5df4 --- /dev/null +++ b/tests/test_dpi.py @@ -0,0 +1,10 @@ +from pathlib import Path +from sbb_newspapers_org_image.utils.pil_cv2 import check_dpi +from tests.base import main + +def test_dpi(): + fpath = Path(__file__).parent.joinpath('resources', 'kant_aufklaerung_1784_0020.tif') + assert 300 == check_dpi(str(fpath)) + +if __name__ == '__main__': + main(__file__) From bce983f3d422f1f80ba49e8b04514df8c3a892ba Mon Sep 17 00:00:00 2001 From: 
Konstantin Baierer Date: Thu, 18 Feb 2021 16:28:55 +0100 Subject: [PATCH 43/89] keep images in memory and copy --- sbb_newspapers_org_image/eynollah.py | 48 ++++++++++++++++------------ 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index a118d3b..e761383 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -156,17 +156,31 @@ class eynollah: self.model_dir_of_enhancemnet = dir_models + "/model_enhancement.h5" self.model_dir_of_col_classifier = dir_models + "/model_scale_classifier.h5" - self.model_region_dir_p = dir_models + "/model_main_covid19_lr5-5_scale_1_1_great.h5" # dir_models +'/model_main_covid_19_many_scalin_down_lr5-5_the_best.h5'#'/model_main_covid19_lr5-5_scale_1_1_great.h5'#'/model_main_scale_1_1und_1_2_corona_great.h5' - # self.model_region_dir_p_ens = dir_models +'/model_ensemble_s.h5'#'/model_main_covid19_lr5-5_scale_1_1_great.h5'#'/model_main_scale_1_1und_1_2_corona_great.h5' + self.model_region_dir_p = dir_models + "/model_main_covid19_lr5-5_scale_1_1_great.h5" self.model_region_dir_p2 = dir_models + "/model_main_home_corona3_rot.h5" - self.model_region_dir_fully_np = dir_models + "/model_no_patches_class0_30eopch.h5" - self.model_region_dir_fully = dir_models + "/model_3up_new_good_no_augmentation.h5" # "model_3col_p_soft_10_less_aug_binarization_only.h5" - + self.model_region_dir_fully = dir_models + "/model_3up_new_good_no_augmentation.h5" self.model_page_dir = dir_models + "/model_page_mixed_best.h5" - self.model_region_dir_p_ens = dir_models + "/model_ensemble_s.h5" # dir_models +'/model_main_covid_19_many_scalin_down_lr5-5_the_best.h5' #dir_models +'/model_ensemble_s.h5' - ###self.model_region_dir_p = dir_models +'/model_layout_newspapers.h5'#'/model_ensemble_s.h5'#'/model_layout_newspapers.h5'#'/model_ensemble_s.h5'#'/model_main_home_5_soft_new.h5'#'/model_home_soft_5_all_data.h5' 
#'/model_main_office_long_soft.h5'#'/model_20_cat_main.h5' - self.model_textline_dir = dir_models + "/model_textline_newspapers.h5" #'/model_hor_ver_home_trextline_very_good.h5'# '/model_hor_ver_1_great.h5'#'/model_curved_office_works_great.h5' + self.model_region_dir_p_ens = dir_models + "/model_ensemble_s.h5" + self.model_textline_dir = dir_models + "/model_textline_newspapers.h5" + + self._imgs = {} + + def imread(self, grayscale=False, uint8=True): + key = 'img' + if grayscale: + key += '_grayscale' + if uint8: + key += '_uint8' + if key not in self._imgs: + if grayscale: + img = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) + else: + img = cv2.imread(self.image_filename) + if uint8: + img = img.astype(np.uint8) + self._imgs[key] = img + return self._imgs[key].copy() def predict_enhancement(self, img): self.logger.debug("enter predict_enhancement") @@ -333,13 +347,12 @@ class eynollah: def resize_image_with_column_classifier(self, is_image_enhanced): self.logger.debug("enter resize_image_with_column_classifier") - img = cv2.imread(self.image_filename) - img = img.astype(np.uint8) + img = self.imread() _, page_coord = self.early_page_for_num_of_column_classification() model_num_classifier, session_col_classifier = self.start_new_session_and_model(self.model_dir_of_col_classifier) - img_1ch = cv2.imread(self.image_filename, cv.IMREAD_GRAYSCALE) + img_1ch = self.imread(grayscale=True, uint8=False) width_early = img_1ch.shape[1] img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] @@ -379,15 +392,12 @@ class eynollah: self.logger.debug("enter resize_and_enhance_image_with_column_classifier") dpi = check_dpi(self.image_filename) self.logger.info("Detected %s DPI" % dpi) - img = cv2.imread(self.image_filename) - - img = img.astype(np.uint8) + img = self.imread() _, page_coord = self.early_page_for_num_of_column_classification() model_num_classifier, session_col_classifier = 
self.start_new_session_and_model(self.model_dir_of_col_classifier) - img_1ch = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) - img_1ch = img_1ch.astype(np.uint8) + img_1ch = self.imread(grayscale=True) width_early = img_1ch.shape[1] @@ -611,8 +621,7 @@ class eynollah: def early_page_for_num_of_column_classification(self): self.logger.debug("enter early_page_for_num_of_column_classification") - img = cv2.imread(self.image_filename) - img = img.astype(np.uint8) + img = self.imread() model_page, session_page = self.start_new_session_and_model(self.model_page_dir) for ii in range(1): img = cv2.GaussianBlur(img, (5, 5), 0) @@ -2114,8 +2123,7 @@ class eynollah: return self.do_order_of_regions_no_full_layout(*args, **kwargs) def run_graphics_and_columns(self, text_regions_p_1, num_col_classifier, num_column_is_classified): - img_g = cv2.imread(self.image_filename, cv2.IMREAD_GRAYSCALE) - img_g = img_g.astype(np.uint8) + img_g = self.imread(grayscale=True, uint8=True) img_g3 = np.zeros((img_g.shape[0], img_g.shape[1], 3)) img_g3 = img_g3.astype(np.uint8) From 7b61b64665fc7b398ad20b18988943b20e907950 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Sat, 20 Feb 2021 13:28:28 +0100 Subject: [PATCH 44/89] :art: remove dead code, spacing --- sbb_newspapers_org_image/eynollah.py | 74 ++++++++-------------------- 1 file changed, 20 insertions(+), 54 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index e761383..1b5dafc 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1263,30 +1263,25 @@ class eynollah: # create the file structure pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) - page_print_sub = ET.SubElement(page, "Border") coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords()) - if len(contours)>0: - region_order=ET.SubElement(page, 'ReadingOrder') + if len(contours) > 0: + 
region_order = ET.SubElement(page, 'ReadingOrder') region_order_sub = ET.SubElement(region_order, 'OrderedGroup') - region_order_sub.set('id',"ro357564684568544579089") - - #args_sort=order_of_texts for vj in order_of_texts: - name="coord_text_"+str(vj) + name = "coord_text_" + str(vj) name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index',str(order_of_texts[vj]) ) + name.set('index', str(order_of_texts[vj]) ) name.set('regionRef',id_of_texts[vj]) - + id_of_marginalia=[] - indexer_region=len(contours)+len(contours_h) + indexer_region = len(contours) + len(contours_h) for vm in range(len(found_polygons_marginals)): - id_of_marginalia.append('r'+str(indexer_region)) - - name="coord_text_"+str(indexer_region) + id_of_marginalia.append('r' + str(indexer_region)) + name = "coord_text_"+str(indexer_region) name = ET.SubElement(region_order_sub, 'RegionRefIndexed') name.set('index',str(indexer_region) ) name.set('regionRef','r'+str(indexer_region)) @@ -1503,7 +1498,6 @@ class eynollah: self.logger.debug('enter write_into_page_xml') found_polygons_text_region = contours - ##found_polygons_text_region_h=contours_h # create the file structure pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) @@ -1515,18 +1509,14 @@ class eynollah: region_order = ET.SubElement(page, 'ReadingOrder') region_order_sub = ET.SubElement(region_order, 'OrderedGroup') region_order_sub.set('id',"ro357564684568544579089") - indexer_region=0 - - for vj in order_of_texts: name="coord_text_"+str(vj) name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index',str(indexer_region) ) name.set('regionRef',id_of_texts[vj]) indexer_region+=1 - + id_of_marginalia=[] for vm in range(len(found_polygons_marginals)): id_of_marginalia.append('r'+str(indexer_region)) @@ -2150,16 +2140,14 @@ class eynollah: img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) img_only_regions = cv2.erode(img_only_regions_with_sep[:, :], 
self.kernel, iterations=6) - try: num_col, peaks_neg_fin = find_num_col(img_only_regions, multiplier=6.0) - if not num_column_is_classified: num_col_classifier = num_col + 1 except: num_col = None peaks_neg_fin = [] - return num_col+1, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 + return num_col + 1, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 def run_enhancement(self): self.logger.info("resize and enhance image") @@ -2212,7 +2200,7 @@ class eynollah: def run_marginals(self, image_page, textline_mask_tot_ea, mask_images, mask_lines, num_col_classifier, slope_deskew, text_regions_p_1): image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] textline_mask_tot[mask_images[:, :] == 1] = 0 - + pixel_img = 1 min_area = 0.00001 max_area = 0.0006 @@ -2225,15 +2213,10 @@ class eynollah: try: regions_without_seperators = (text_regions_p[:, :] == 1) * 1 regions_without_seperators = regions_without_seperators.astype(np.uint8) - - text_regions_p = get_marginals(rotate_image(regions_without_seperators, slope_deskew), text_regions_p, num_col_classifier, slope_deskew, kernel=self.kernel) except: pass - # plt.imshow(text_regions_p) - # plt.show() - if self.plotter: self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) self.plotter.save_plot_of_layout_main(text_regions_p, image_page) @@ -2247,13 +2230,10 @@ class eynollah: textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) - - if np.abs(slope_deskew) < SLOPE_THRESHOLD: text_regions_p_1_n = None textline_mask_tot_d = None regions_without_seperators_d 
= None - pixel_lines = 3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) @@ -2279,7 +2259,6 @@ class eynollah: #random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 #random_pixels_for_image[random_pixels_for_image != 0] = 1 - #regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 2)] = 1 t1 = time.time() @@ -2291,7 +2270,7 @@ class eynollah: boxes_d = return_boxes_of_images_by_order_of_reading_new(spliter_y_new_d, regions_without_seperators_d, matrix_of_lines_ch_d, num_col_classifier) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) - + self.logger.info("detecting boxes took %ss", str(time.time() - t1)) img_revised_tab = text_regions_p[:, :] polygons_of_images = return_contours_of_interested_region(img_revised_tab, 2) @@ -2369,21 +2348,17 @@ class eynollah: textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) regions_fully_n = resize_image(regions_fully_n, text_regions_p.shape[0], text_regions_p.shape[1]) regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 - else: text_regions_p_1_n = None textline_mask_tot_d = None regions_without_seperators_d = None - - regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) K.clear_session() gc.collect() img_revised_tab = np.copy(text_regions_p[:, :]) - pixel_img = 5 - polygons_of_images = return_contours_of_interested_region(img_revised_tab, pixel_img) + polygons_of_images = return_contours_of_interested_region(img_revised_tab, 5) 
self.logger.debug('exit run_boxes_full_layout') return polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, regions_fully, regions_without_seperators @@ -2405,8 +2380,7 @@ class eynollah: num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 = \ self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified) self.logger.info("Graphics detection took %ss ", str(time.time() - t1)) - - + if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") self.write_into_page_xml([], page_coord, self.dir_out, [], [], [], [], [], [], [], [], self.curved_line, [], []) @@ -2497,11 +2471,8 @@ class eynollah: (h, w) = text_only.shape[:2] center = (w // 2.0, h // 2.0) M = cv2.getRotationMatrix2D(center, slope_deskew, 1.0) - M_22 = np.array(M)[:2, :2] - p_big = np.dot(M_22, [cx_bigest_big, cy_biggest_big]) - x_diff = p_big[0] - cx_bigest_d_big y_diff = p_big[1] - cy_biggest_d_big @@ -2547,9 +2518,9 @@ class eynollah: cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest]) cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent) - # print(areas_cnt_text_parent,'areas_cnt_text_parent') - # print(areas_cnt_text_parent_d,'areas_cnt_text_parent_d') - # print(len(contours_only_text_parent),len(contours_only_text_parent_d),'vizzz') + self.logger.debug('areas_cnt_text_parent %s', areas_cnt_text_parent) + self.logger.debug('areas_cnt_text_parent_d %s', areas_cnt_text_parent_d) + self.logger.debug('len(contours_only_text_parent) %s', len(contours_only_text_parent_d)) txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) @@ -2579,8 +2550,6 @@ class eynollah: else: contours_only_text_parent_d_ordered = None 
text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) - - if self.plotter: self.plotter.save_plot_of_layout(text_regions_p, image_page) @@ -2599,9 +2568,9 @@ class eynollah: if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h) + num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h) else: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h_d_ordered) + num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h_d_ordered) elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) @@ -2613,7 +2582,6 @@ class eynollah: # print(matrix_of_lines_ch.shape,matrix_of_lines_ch_d.shape,'matrix_of_lines_ch') 
if num_col_classifier >= 3: - if np.abs(slope_deskew) < SLOPE_THRESHOLD: regions_without_seperators = regions_without_seperators.astype(np.uint8) regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) @@ -2644,16 +2612,14 @@ class eynollah: order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) self.write_into_page_xml_full(contours_only_text_parent, contours_only_text_parent_h, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals) + else: contours_only_text_parent_h = None - # self.logger.debug('bura galmir?') if np.abs(slope_deskew) < SLOPE_THRESHOLD: - #contours_only_text_parent = list(np.array(contours_only_text_parent)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - # order_text_new , id_of_texts_tot=self.do_order_of_regions(contours_only_text_parent,contours_only_text_parent_h,boxes,textline_mask_tot) self.write_into_page_xml(txt_con_org, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals) self.logger.info("Job done in %ss", str(time.time() - t1)) From 
6035740b52a2f2bbf9c62214423bf55dc2cff577 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 13:02:43 +0100 Subject: [PATCH 45/89] num_col should default to 0 not None --- sbb_newspapers_org_image/eynollah.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 1b5dafc..d60db54 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -2145,7 +2145,7 @@ class eynollah: if not num_column_is_classified: num_col_classifier = num_col + 1 except: - num_col = None + num_col = 0 peaks_neg_fin = [] return num_col + 1, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 @@ -2519,8 +2519,8 @@ class eynollah: cx_bigest_big, cy_biggest_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest]) cx_bigest, cy_biggest, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent) self.logger.debug('areas_cnt_text_parent %s', areas_cnt_text_parent) - self.logger.debug('areas_cnt_text_parent_d %s', areas_cnt_text_parent_d) - self.logger.debug('len(contours_only_text_parent) %s', len(contours_only_text_parent_d)) + # self.logger.debug('areas_cnt_text_parent_d %s', areas_cnt_text_parent_d) + # self.logger.debug('len(contours_only_text_parent) %s', len(contours_only_text_parent_d)) txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent) From bfb6e77db1f403ec35802d3a1e42e02996e73428 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 13:44:45 +0100 Subject: [PATCH 46/89] factor out reading order code, simplify --- sbb_newspapers_org_image/eynollah.py | 123 +++++++++------------------ 1 file changed, 39 insertions(+), 84 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py 
b/sbb_newspapers_org_image/eynollah.py index d60db54..48755c5 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1267,80 +1267,35 @@ class eynollah: coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords()) + id_indexer = 0 + id_indexer_l = 0 if len(contours) > 0: - region_order = ET.SubElement(page, 'ReadingOrder') - region_order_sub = ET.SubElement(region_order, 'OrderedGroup') - region_order_sub.set('id',"ro357564684568544579089") - for vj in order_of_texts: - name = "coord_text_" + str(vj) - name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index', str(order_of_texts[vj]) ) - name.set('regionRef',id_of_texts[vj]) - - id_of_marginalia=[] - indexer_region = len(contours) + len(contours_h) - for vm in range(len(found_polygons_marginals)): - id_of_marginalia.append('r' + str(indexer_region)) - name = "coord_text_"+str(indexer_region) - name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index',str(indexer_region) ) - name.set('regionRef','r'+str(indexer_region)) - indexer_region+=1 - - - id_indexer=0 - id_indexer_l=0 - + self.xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') - - textregion.set('id','r'+str(id_indexer)) - id_indexer+=1 - - textregion.set('type','paragraph') + textregion.set('id', 'r%s' % id_indexer) + id_indexer += 1 + textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) - texteqreg=ET.SubElement(textregion, 'TextEquiv') - - unireg=ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' - + texteqreg = 
ET.SubElement(textregion, 'TextEquiv') + unireg = ET.SubElement(texteqreg, 'Unicode') + unireg.text = ' ' - #print(len(contours_h)) - if len(contours_h)>0: + self.logger.debug('len(contours_h) %s', len(contours_h)) + if len(contours_h) > 0: for mm in range(len(found_polygons_text_region_h)): textregion=ET.SubElement(page, 'TextRegion') - try: - id_indexer=id_indexer - id_indexer_l=id_indexer_l - except: - id_indexer=0 - id_indexer_l=0 - textregion.set('id','r'+str(id_indexer)) - id_indexer+=1 - + textregion.set('id', 'r%s' % id_indexer) + id_indexer += 1 textregion.set('type','header') - #if mm==0: - # textregion.set('type','header') - #else: - # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) - - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) texteqreg=ET.SubElement(textregion, 'TextEquiv') - unireg=ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' - - - - + unireg.text = ' ' if len(found_polygons_drop_capitals)>0: @@ -1494,6 +1449,28 @@ class eynollah: points_page_print = points_page_print + ' ' return points_page_print + def xml_reading_order(self, page, order_of_texts, id_of_texts, found_polygons_marginals): + region_order = ET.SubElement(page, 'ReadingOrder') + region_order_sub = ET.SubElement(region_order, 'OrderedGroup') + region_order_sub.set('id', "ro357564684568544579089") + indexer_region = 0 + for vj in order_of_texts: + name = "coord_text_%s" % vj + name = ET.SubElement(region_order_sub, 'RegionRefIndexed') + name.set('index', str(indexer_region)) + name.set('regionRef', id_of_texts[vj]) + indexer_region+=1 + id_of_marginalia=[] + for vm in range(len(found_polygons_marginals)): + id_of_marginalia.append('r%s' % indexer_region) + name = "coord_text_%s" % indexer_region + name = ET.SubElement(region_order_sub, 
'RegionRefIndexed') + name.set('index', str(indexer_region)) + name.set('regionRef', 'r%s' % indexer_region) + indexer_region += 1 + return id_of_marginalia + + def write_into_page_xml(self, contours, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): self.logger.debug('enter write_into_page_xml') @@ -1505,32 +1482,9 @@ class eynollah: coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords()) - if len(contours) > 0: - region_order = ET.SubElement(page, 'ReadingOrder') - region_order_sub = ET.SubElement(region_order, 'OrderedGroup') - region_order_sub.set('id',"ro357564684568544579089") - indexer_region=0 - for vj in order_of_texts: - name="coord_text_"+str(vj) - name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index',str(indexer_region) ) - name.set('regionRef',id_of_texts[vj]) - indexer_region+=1 - - id_of_marginalia=[] - for vm in range(len(found_polygons_marginals)): - id_of_marginalia.append('r'+str(indexer_region)) - - name = "coord_text_"+str(indexer_region) - name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index',str(indexer_region) ) - name.set('regionRef','r' + str(indexer_region)) - indexer_region += 1 - - - - + if len(contours) > 0: + id_of_marginalia = self.xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) id_indexer = 0 id_indexer_l = 0 @@ -2434,6 +2388,7 @@ class eynollah: contours_only_text_parent = return_parent_contours(contours_only_text, hir_on_text) areas_cnt_text = np.array([cv2.contourArea(contours_only_text_parent[j]) for j in range(len(contours_only_text_parent))]) areas_cnt_text = areas_cnt_text / float(text_only.shape[0] * text_only.shape[1]) + self.logger.info('areas_cnt_text %s', areas_cnt_text) contours_biggest 
= contours_only_text_parent[np.argmax(areas_cnt_text)] contours_only_text_parent = [contours_only_text_parent[jz] for jz in range(len(contours_only_text_parent)) if areas_cnt_text[jz] > min_con_area] areas_cnt_text_parent = [areas_cnt_text[jz] for jz in range(len(areas_cnt_text)) if areas_cnt_text[jz] > min_con_area] From fdbcfe44a09ac75ef34488236521ec33358cb32a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 13:49:48 +0100 Subject: [PATCH 47/89] :art: simplify --- sbb_newspapers_org_image/eynollah.py | 47 ++++++++++------------------ 1 file changed, 16 insertions(+), 31 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 48755c5..ff0d8bf 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1269,8 +1269,10 @@ class eynollah: id_indexer = 0 id_indexer_l = 0 + id_of_marginalia = [] + if len(contours) > 0: - self.xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) + self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') textregion.set('id', 'r%s' % id_indexer) @@ -1293,42 +1295,23 @@ class eynollah: coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) - texteqreg=ET.SubElement(textregion, 'TextEquiv') - unireg=ET.SubElement(texteqreg, 'Unicode') + texteqreg = ET.SubElement(textregion, 'TextEquiv') + unireg = ET.SubElement(texteqreg, 'Unicode') unireg.text = ' ' - - if len(found_polygons_drop_capitals)>0: - id_indexer=len(contours_h)+len(contours)+len(found_polygons_marginals) + if len(found_polygons_drop_capitals) > 0: + id_indexer = 
len(contours_h) + len(contours) + len(found_polygons_marginals) for mm in range(len(found_polygons_drop_capitals)): textregion=ET.SubElement(page, 'TextRegion') - - - #id_indexer_l=id_indexer_l - - textregion.set('id','r'+str(id_indexer)) - id_indexer+=1 - + textregion.set('id',' r%s' % id_indexer) + id_indexer += 1 textregion.set('type','drop-capital') - #if mm==0: - # textregion.set('type','header') - #else: - # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) - - - texteqreg = ET.SubElement(textregion, 'TextEquiv') unireg=ET.SubElement(texteqreg, 'Unicode') unireg.text = ' ' - - - - - try: - try: id_indexer_l=id_indexer_l except: @@ -1336,7 +1319,7 @@ class eynollah: for mm in range(len(found_polygons_marginals)): textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id',id_of_marginalia[mm]) + textregion.set('id', id_of_marginalia[mm]) textregion.set('type','marginalia') #if mm==0: @@ -1449,7 +1432,10 @@ class eynollah: points_page_print = points_page_print + ' ' return points_page_print - def xml_reading_order(self, page, order_of_texts, id_of_texts, found_polygons_marginals): + def xml_reading_order(self, page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals): + """ + XXX side-effect: extends id_of_marginalia + """ region_order = ET.SubElement(page, 'ReadingOrder') region_order_sub = ET.SubElement(region_order, 'OrderedGroup') region_order_sub.set('id', "ro357564684568544579089") @@ -1460,7 +1446,6 @@ class eynollah: name.set('index', str(indexer_region)) name.set('regionRef', id_of_texts[vj]) indexer_region+=1 - id_of_marginalia=[] for vm in range(len(found_polygons_marginals)): id_of_marginalia.append('r%s' % indexer_region) name = "coord_text_%s" % indexer_region @@ -1468,11 +1453,11 @@ class eynollah: name.set('index', str(indexer_region)) name.set('regionRef', 'r%s' % indexer_region) 
indexer_region += 1 - return id_of_marginalia def write_into_page_xml(self, contours, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): self.logger.debug('enter write_into_page_xml') + id_of_marginalia found_polygons_text_region = contours @@ -1484,7 +1469,7 @@ class eynollah: if len(contours) > 0: - id_of_marginalia = self.xml_reading_order(page, order_of_texts, id_of_texts, found_polygons_marginals) + self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) id_indexer = 0 id_indexer_l = 0 From 5f04fc71379519eb28089424d431cc068796a6e0 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 13:52:11 +0100 Subject: [PATCH 48/89] move the two xml serialization methods next to each other --- sbb_newspapers_org_image/eynollah.py | 322 +++++++++++++-------------- 1 file changed, 161 insertions(+), 161 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index ff0d8bf..c2f4f8a 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1255,166 +1255,6 @@ class eynollah: #print(coords) return coords - def write_into_page_xml_full(self, contours, contours_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): - self.logger.debug('enter write_into_page_xml_full') - - found_polygons_text_region = contours - found_polygons_text_region_h = contours_h - - # create the file structure - pcgts, page = create_page_xml(self.image_filename, 
self.height_org, self.width_org) - page_print_sub = ET.SubElement(page, "Border") - coord_page = ET.SubElement(page_print_sub, "Coords") - coord_page.set('points', self.calculate_page_coords()) - - id_indexer = 0 - id_indexer_l = 0 - id_of_marginalia = [] - - if len(contours) > 0: - self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) - for mm in range(len(found_polygons_text_region)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 - textregion.set('type', 'paragraph') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg = ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' - - self.logger.debug('len(contours_h) %s', len(contours_h)) - if len(contours_h) > 0: - for mm in range(len(found_polygons_text_region_h)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 - textregion.set('type','header') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg = ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' - - if len(found_polygons_drop_capitals) > 0: - id_indexer = len(contours_h) + len(contours) + len(found_polygons_marginals) - for mm in range(len(found_polygons_drop_capitals)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id',' r%s' % id_indexer) - id_indexer += 1 - 
textregion.set('type','drop-capital') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg=ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' - try: - try: - id_indexer_l=id_indexer_l - except: - id_indexer_l=0 - for mm in range(len(found_polygons_marginals)): - textregion=ET.SubElement(page, 'TextRegion') - - textregion.set('id', id_of_marginalia[mm]) - - textregion.set('type','marginalia') - #if mm==0: - # textregion.set('type','header') - #else: - # textregion.set('type','paragraph') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) - - for j in range(len(all_found_texline_polygons_marginals[mm])): - textline=ET.SubElement(textregion, 'TextLine') - textline.set('id','l'+str(id_indexer_l)) - id_indexer_l+=1 - coord = ET.SubElement(textline, 'Coords') - texteq=ET.SubElement(textline, 'TextEquiv') - uni=ET.SubElement(texteq, 'Unicode') - uni.text = ' ' - points_co='' - for l in range(len(all_found_texline_polygons_marginals[mm][j])): - if not self.curved_line: - if len(all_found_texline_polygons_marginals[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] - +all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1] - +all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) ) - else: - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0] - +all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1] - +all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) ) - else: - if 
len(all_found_texline_polygons_marginals[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] - +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1] - +page_coord[0])/self.scale_y) ) - else: - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0] - +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1] - +page_coord[0])/self.scale_y) ) - - if l<(len(all_found_texline_polygons_marginals[mm][j])-1): - points_co=points_co+' ' - #print(points_co) - coord.set('points',points_co) - - - texteqreg=ET.SubElement(textregion, 'TextEquiv') - - unireg=ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' - except: - pass - - try: - id_indexer=len(contours_h)+len(contours)+len(found_polygons_marginals)+len(found_polygons_drop_capitals) - for mm in range(len(found_polygons_text_region_img)): - textregion=ET.SubElement(page, 'ImageRegion') - - textregion.set('id','r'+str(id_indexer)) - id_indexer+=1 - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) - except: - pass - - - try: - for mm in range(len(found_polygons_tables)): - textregion=ET.SubElement(page, 'TableRegion') - - textregion.set('id','r'+str(id_indexer)) - id_indexer+=1 - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) - except: - pass - - ##print(dir_of_image) - ##print(self.f_name) - ##print(os.path.join(dir_of_image, self.f_name) + ".xml") - ##tree = ET.ElementTree(pcgts) - ##tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") - - self.logger.info("filename stem: '%s'", self.image_filename_stem) - # print(os.path.join(dir_of_image, 
self.image_filename_stem) + ".xml") - tree = ET.ElementTree(pcgts) - tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") - - def calculate_page_coords(self): self.logger.debug('enter calculate_page_coords') points_page_print = "" @@ -1457,7 +1297,7 @@ class eynollah: def write_into_page_xml(self, contours, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): self.logger.debug('enter write_into_page_xml') - id_of_marginalia + id_of_marginalia = [] found_polygons_text_region = contours @@ -1599,6 +1439,166 @@ class eynollah: tree = ET.ElementTree(pcgts) tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") + def write_into_page_xml_full(self, contours, contours_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): + self.logger.debug('enter write_into_page_xml_full') + + found_polygons_text_region = contours + found_polygons_text_region_h = contours_h + + # create the file structure + pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) + page_print_sub = ET.SubElement(page, "Border") + coord_page = ET.SubElement(page_print_sub, "Coords") + coord_page.set('points', self.calculate_page_coords()) + + id_indexer = 0 + id_indexer_l = 0 + id_of_marginalia = [] + + if len(contours) > 0: + self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) + for mm in range(len(found_polygons_text_region)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id', 
'r%s' % id_indexer) + id_indexer += 1 + textregion.set('type', 'paragraph') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) + texteqreg = ET.SubElement(textregion, 'TextEquiv') + unireg = ET.SubElement(texteqreg, 'Unicode') + unireg.text = ' ' + + self.logger.debug('len(contours_h) %s', len(contours_h)) + if len(contours_h) > 0: + for mm in range(len(found_polygons_text_region_h)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id', 'r%s' % id_indexer) + id_indexer += 1 + textregion.set('type','header') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) + texteqreg = ET.SubElement(textregion, 'TextEquiv') + unireg = ET.SubElement(texteqreg, 'Unicode') + unireg.text = ' ' + + if len(found_polygons_drop_capitals) > 0: + id_indexer = len(contours_h) + len(contours) + len(found_polygons_marginals) + for mm in range(len(found_polygons_drop_capitals)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id',' r%s' % id_indexer) + id_indexer += 1 + textregion.set('type','drop-capital') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) + texteqreg = ET.SubElement(textregion, 'TextEquiv') + unireg=ET.SubElement(texteqreg, 'Unicode') + unireg.text = ' ' + try: + try: + id_indexer_l=id_indexer_l + except: + id_indexer_l=0 + for mm in range(len(found_polygons_marginals)): + textregion=ET.SubElement(page, 'TextRegion') + + textregion.set('id', 
id_of_marginalia[mm]) + + textregion.set('type','marginalia') + #if mm==0: + # textregion.set('type','header') + #else: + # textregion.set('type','paragraph') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) + + for j in range(len(all_found_texline_polygons_marginals[mm])): + textline=ET.SubElement(textregion, 'TextLine') + textline.set('id','l'+str(id_indexer_l)) + id_indexer_l+=1 + coord = ET.SubElement(textline, 'Coords') + texteq=ET.SubElement(textline, 'TextEquiv') + uni=ET.SubElement(texteq, 'Unicode') + uni.text = ' ' + points_co='' + for l in range(len(all_found_texline_polygons_marginals[mm][j])): + if not self.curved_line: + if len(all_found_texline_polygons_marginals[mm][j][l])==2: + points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] + +all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x) ) + points_co=points_co+',' + points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1] + +all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) ) + else: + points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0] + +all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x ) ) + points_co=points_co+',' + points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1] + +all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) ) + else: + if len(all_found_texline_polygons_marginals[mm][j][l])==2: + points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] + +page_coord[2])/self.scale_x) ) + points_co=points_co+',' + points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1] + +page_coord[0])/self.scale_y) ) + else: + points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0] + +page_coord[2])/self.scale_x ) ) + points_co=points_co+',' + points_co=points_co+str( int( ( 
all_found_texline_polygons_marginals[mm][j][l][0][1] + +page_coord[0])/self.scale_y) ) + + if l<(len(all_found_texline_polygons_marginals[mm][j])-1): + points_co=points_co+' ' + #print(points_co) + coord.set('points',points_co) + + + texteqreg=ET.SubElement(textregion, 'TextEquiv') + + unireg=ET.SubElement(texteqreg, 'Unicode') + unireg.text = ' ' + except: + pass + + try: + id_indexer=len(contours_h)+len(contours)+len(found_polygons_marginals)+len(found_polygons_drop_capitals) + for mm in range(len(found_polygons_text_region_img)): + textregion=ET.SubElement(page, 'ImageRegion') + + textregion.set('id','r'+str(id_indexer)) + id_indexer+=1 + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) + except: + pass + + + try: + for mm in range(len(found_polygons_tables)): + textregion=ET.SubElement(page, 'TableRegion') + + textregion.set('id','r'+str(id_indexer)) + id_indexer+=1 + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) + except: + pass + + ##print(dir_of_image) + ##print(self.f_name) + ##print(os.path.join(dir_of_image, self.f_name) + ".xml") + ##tree = ET.ElementTree(pcgts) + ##tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") + + self.logger.info("filename stem: '%s'", self.image_filename_stem) + # print(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") + tree = ET.ElementTree(pcgts) + tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") + + def get_regions_from_xy_2models(self,img,is_image_enhanced): self.logger.debug("enter get_regions_from_xy_2models") img_org = np.copy(img) From 4480302b534fcc953b3dcd2ddcd6fe1bc46ac070 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 14:07:33 +0100 Subject: [PATCH 49/89] write_into_page.*: use one variable name only --- 
sbb_newspapers_org_image/eynollah.py | 34 ++++++++++------------------ 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index c2f4f8a..7f218a3 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1295,11 +1295,8 @@ class eynollah: indexer_region += 1 - def write_into_page_xml(self, contours, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): + def write_into_page_xml(self, found_polygons_text_region, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): self.logger.debug('enter write_into_page_xml') - id_of_marginalia = [] - - found_polygons_text_region = contours # create the file structure pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) @@ -1308,10 +1305,11 @@ class eynollah: coord_page.set('points', self.calculate_page_coords()) - if len(contours) > 0: + id_of_marginalia = [] + id_indexer = 0 + id_indexer_l = 0 + if len(found_polygons_text_region) > 0: self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) - id_indexer = 0 - id_indexer_l = 0 for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') @@ -1414,7 +1412,7 @@ class eynollah: pass try: - id_indexer=len(contours)+len(found_polygons_marginals) + id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) for mm in range(len(found_polygons_text_region_img)): textregion=ET.SubElement(page, 'ImageRegion') @@ -1439,12 +1437,9 @@ class 
eynollah: tree = ET.ElementTree(pcgts) tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") - def write_into_page_xml_full(self, contours, contours_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): + def write_into_page_xml_full(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): self.logger.debug('enter write_into_page_xml_full') - found_polygons_text_region = contours - found_polygons_text_region_h = contours_h - # create the file structure pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) page_print_sub = ET.SubElement(page, "Border") @@ -1455,7 +1450,7 @@ class eynollah: id_indexer_l = 0 id_of_marginalia = [] - if len(contours) > 0: + if len(found_polygons_text_region) > 0: self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') @@ -1469,8 +1464,8 @@ class eynollah: unireg = ET.SubElement(texteqreg, 'Unicode') unireg.text = ' ' - self.logger.debug('len(contours_h) %s', len(contours_h)) - if len(contours_h) > 0: + self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) + if len(found_polygons_text_region_h) > 0: for mm in range(len(found_polygons_text_region_h)): 
textregion=ET.SubElement(page, 'TextRegion') textregion.set('id', 'r%s' % id_indexer) @@ -1484,7 +1479,7 @@ class eynollah: unireg.text = ' ' if len(found_polygons_drop_capitals) > 0: - id_indexer = len(contours_h) + len(contours) + len(found_polygons_marginals) + id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) for mm in range(len(found_polygons_drop_capitals)): textregion=ET.SubElement(page, 'TextRegion') textregion.set('id',' r%s' % id_indexer) @@ -1496,13 +1491,8 @@ class eynollah: unireg=ET.SubElement(texteqreg, 'Unicode') unireg.text = ' ' try: - try: - id_indexer_l=id_indexer_l - except: - id_indexer_l=0 for mm in range(len(found_polygons_marginals)): - textregion=ET.SubElement(page, 'TextRegion') - + textregion = ET.SubElement(page, 'TextRegion') textregion.set('id', id_of_marginalia[mm]) textregion.set('type','marginalia') From 881e2787ab977f2f6a6f0f2914a44a00b9957298 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Tue, 23 Feb 2021 15:07:13 +0100 Subject: [PATCH 50/89] resolving issue with None num_col --- sbb_newspapers_org_image/eynollah.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index d60db54..1e466b8 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -2142,12 +2142,13 @@ class eynollah: try: num_col, peaks_neg_fin = find_num_col(img_only_regions, multiplier=6.0) + num_col = num_col + 1 if not num_column_is_classified: num_col_classifier = num_col + 1 except: - num_col = 0 + num_col = None peaks_neg_fin = [] - return num_col + 1, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 + return num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 def run_enhancement(self): self.logger.info("resize and enhance image") @@ -2381,10 
+2382,11 @@ class eynollah: self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified) self.logger.info("Graphics detection took %ss ", str(time.time() - t1)) - if not num_col: + if num_col is None: self.logger.info("No columns detected, outputting an empty PAGE-XML") self.write_into_page_xml([], page_coord, self.dir_out, [], [], [], [], [], [], [], [], self.curved_line, [], []) self.logger.info("Job done in %ss", str(time.time() - t1)) + sys.exit() return t1 = time.time() From 48d8406fbab90fbb20975a195a931b978c3ccf92 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 15:10:01 +0100 Subject: [PATCH 51/89] remove unnecessary try-except, formatting --- sbb_newspapers_org_image/eynollah.py | 132 ++++++++++----------------- 1 file changed, 49 insertions(+), 83 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 7f218a3..15df0af 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1310,7 +1310,7 @@ class eynollah: id_indexer_l = 0 if len(found_polygons_text_region) > 0: self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) - + for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') textregion.set('id', 'r'+str(id_indexer)) @@ -1395,42 +1395,36 @@ class eynollah: else: points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + all_box_coord_marginals[mm][0] + page_coord[0])/self.scale_y)) + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + all_box_coord_marginals[mm][0] + page_coord[0])/self.scale_y)) else: - if len(all_found_texline_polygons_marginals[mm][j][l])==2: + if len(all_found_texline_polygons_marginals[mm][j][l]) == 2:
points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + page_coord[0]) / self.scale_y)) else: points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) if l < len(all_found_texline_polygons_marginals[mm][j]) - 1: points_co += ' ' coord.set('points',points_co) except: pass - - try: - id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) - for mm in range(len(found_polygons_text_region_img)): - textregion=ET.SubElement(page, 'ImageRegion') - textregion.set('id','r'+str(id_indexer)) - id_indexer+=1 - - - coord_text = ET.SubElement(textregion, 'Coords') - points_co='' - for lmm in range(len(found_polygons_text_region_img[mm])): - points_co=points_co+str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) - points_co=points_co+',' - points_co=points_co+str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) - if lmm < len(found_polygons_text_region_img[mm]) - 1: - points_co += ' ' - coord_text.set('points', points_co) - except: - pass + id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) + for mm in range(len(found_polygons_text_region_img)): + textregion=ET.SubElement(page, 'ImageRegion') + textregion.set('id', 'r%s' % id_indexer) + id_indexer += 1 + coord_text = ET.SubElement(textregion, 'Coords') + points_co = '' + for lmm in range(len(found_polygons_text_region_img[mm])): + points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += 
str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) + if lmm < len(found_polygons_text_region_img[mm]) - 1: + points_co += ' ' + coord_text.set('points', points_co) self.logger.info("filename stem: '%s'", self.image_filename_stem) @@ -1489,105 +1483,77 @@ class eynollah: coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) texteqreg = ET.SubElement(textregion, 'TextEquiv') unireg=ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' + unireg.text = ' ' try: for mm in range(len(found_polygons_marginals)): textregion = ET.SubElement(page, 'TextRegion') textregion.set('id', id_of_marginalia[mm]) - - textregion.set('type','marginalia') - #if mm==0: - # textregion.set('type','header') - #else: - # textregion.set('type','paragraph') + textregion.set('type', 'marginalia') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) for j in range(len(all_found_texline_polygons_marginals[mm])): - textline=ET.SubElement(textregion, 'TextLine') - textline.set('id','l'+str(id_indexer_l)) - id_indexer_l+=1 + textline = ET.SubElement(textregion, 'TextLine') + textline.set('id', 'l%s' % id_indexer_l) + id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') - texteq=ET.SubElement(textline, 'TextEquiv') - uni=ET.SubElement(texteq, 'Unicode') - uni.text = ' ' + texteq = ET.SubElement(textline, 'TextEquiv') + uni = ET.SubElement(texteq, 'Unicode') + uni.text = ' ' points_co='' for l in range(len(all_found_texline_polygons_marginals[mm][j])): if not self.curved_line: - if len(all_found_texline_polygons_marginals[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] - +all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1] - 
+all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) ) + if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) else: - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0] - +all_box_coord_marginals[mm][2]+page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1] - +all_box_coord_marginals[mm][0]+page_coord[0])/self.scale_y) ) + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co+= str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) else: if len(all_found_texline_polygons_marginals[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][0] - +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons_marginals[mm][j][l][1] - +page_coord[0])/self.scale_y) ) + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + page_coord[0]) / self.scale_y)) else: - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][0] - +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons_marginals[mm][j][l][0][1] - +page_coord[0])/self.scale_y) ) - + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + 
page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) + if l<(len(all_found_texline_polygons_marginals[mm][j])-1): points_co=points_co+' ' - #print(points_co) coord.set('points',points_co) - - - texteqreg=ET.SubElement(textregion, 'TextEquiv') - - unireg=ET.SubElement(texteqreg, 'Unicode') + texteqreg = ET.SubElement(textregion, 'TextEquiv') + unireg = ET.SubElement(texteqreg, 'Unicode') unireg.text = ' ' except: pass - + try: - id_indexer=len(contours_h)+len(contours)+len(found_polygons_marginals)+len(found_polygons_drop_capitals) + id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals) for mm in range(len(found_polygons_text_region_img)): textregion=ET.SubElement(page, 'ImageRegion') - - textregion.set('id','r'+str(id_indexer)) - id_indexer+=1 + textregion.set('id','r%s' % id_indexer) + id_indexer += 1 coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) except: pass - try: for mm in range(len(found_polygons_tables)): - textregion=ET.SubElement(page, 'TableRegion') - - textregion.set('id','r'+str(id_indexer)) - id_indexer+=1 + textregion = ET.SubElement(page, 'TableRegion') + textregion.set('id', 'r%s' %id_indexer) + id_indexer += 1 coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) except: pass - ##print(dir_of_image) - ##print(self.f_name) - ##print(os.path.join(dir_of_image, self.f_name) + ".xml") - ##tree = ET.ElementTree(pcgts) - ##tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") - self.logger.info("filename stem: '%s'", self.image_filename_stem) - # print(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") tree = ET.ElementTree(pcgts) 
tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") - def get_regions_from_xy_2models(self,img,is_image_enhanced): self.logger.debug("enter get_regions_from_xy_2models") From b3dd6685e720b0c5387136a1408b0a85f08d8131 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 15:39:42 +0100 Subject: [PATCH 52/89] sys.exit not necessary --- sbb_newspapers_org_image/eynollah.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 8c9e26e..66405cb 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -379,7 +379,6 @@ class eynollah: K.clear_session() gc.collect() - # sys.exit() img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) if img_new.shape[1] > img.shape[1]: @@ -1585,7 +1584,6 @@ class eynollah: #plt.imshow(prediction_regions_org_y[:,:,0]) #plt.show() - #sys.exit() prediction_regions_org_y=prediction_regions_org_y[:,:,0] mask_zeros_y=(prediction_regions_org_y[:,:]==0)*1 if is_image_enhanced: @@ -1611,7 +1609,6 @@ class eynollah: ##plt.imshow(prediction_regions_org[:,:,0]) ##plt.show() - ##sys.exit() prediction_regions_org=prediction_regions_org[:,:,0] prediction_regions_org[(prediction_regions_org[:,:]==1) & (mask_zeros_y[:,:]==1)]=0 @@ -1647,7 +1644,6 @@ class eynollah: #plt.imshow(prediction_regions_org2[:,:,0]) #plt.show() - #sys.exit() ##prediction_regions_org=prediction_regions_org[:,:,0] session_region.close() @@ -2277,11 +2273,10 @@ class eynollah: self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified) self.logger.info("Graphics detection took %ss ", str(time.time() - t1)) - if num_col is None: + if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") self.write_into_page_xml([], page_coord, self.dir_out, [], [], [], [], [], [], [], [], 
self.curved_line, [], []) self.logger.info("Job done in %ss", str(time.time() - t1)) - sys.exit() return t1 = time.time() From 23e97a5e0bff7235d2121fb242663593c6eed5ad Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 15:45:38 +0100 Subject: [PATCH 53/89] factor out add_textequiv --- sbb_newspapers_org_image/eynollah.py | 43 +++++++-------------------- sbb_newspapers_org_image/utils/xml.py | 4 +++ 2 files changed, 15 insertions(+), 32 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 66405cb..e402e86 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -107,7 +107,7 @@ from .utils import ( return_boxes_of_images_by_order_of_reading_new, ) -from .utils.xml import create_page_xml +from .utils.xml import create_page_xml, add_textequiv from .utils.pil_cv2 import check_dpi from .plot import EynollahPlotter @@ -1164,11 +1164,7 @@ class eynollah: textline.set('id','l'+str(id_indexer_l)) id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') - texteq = ET.SubElement(textline, 'TextEquiv') - uni = ET.SubElement(texteq, 'Unicode') - uni.text = ' ' - - #points = ET.SubElement(coord, 'Points') + add_textequiv(textline) points_co='' for l in range(len(all_found_texline_polygons[region_idx][j])): @@ -1303,7 +1299,6 @@ class eynollah: coord_page = ET.SubElement(page_print_sub, "Coords") coord_page.set('points', self.calculate_page_coords()) - id_of_marginalia = [] id_indexer = 0 id_indexer_l = 0 @@ -1322,9 +1317,7 @@ class eynollah: textline.set('id', 'l' + str(id_indexer_l)) id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') - texteq=ET.SubElement(textline, 'TextEquiv') - uni=ET.SubElement(texteq, 'Unicode') - uni.text = ' ' + add_textequiv(textline) points_co='' for l in range(len(all_found_texline_polygons[mm][j])): #point = ET.SubElement(coord, 'Point') @@ -1360,9 +1353,7 @@ class eynollah: points_co += ' ' coord.set('points', points_co) - 
texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg = ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' + add_textequiv(textregion) try: #id_indexer_l=0 try: @@ -1381,10 +1372,8 @@ class eynollah: textline.set('id','l'+str(id_indexer_l)) id_indexer_l+=1 coord = ET.SubElement(textline, 'Coords') - texteq = ET.SubElement(textline, 'TextEquiv') - uni = ET.SubElement(texteq, 'Unicode') - uni.text = ' ' - points_co='' + add_textequiv(textline) + points_co = '' for l in range(len(all_found_texline_polygons_marginals[mm][j])): if not curved_line: if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: @@ -1453,9 +1442,7 @@ class eynollah: coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg = ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' + add_textequiv(textregion) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) if len(found_polygons_text_region_h) > 0: @@ -1467,9 +1454,7 @@ class eynollah: coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg = ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' + add_textequiv(textregion) if len(found_polygons_drop_capitals) > 0: id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) @@ -1480,9 +1465,7 @@ class eynollah: textregion.set('type','drop-capital') coord_text = ET.SubElement(textregion, 'Coords') 
coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg=ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' + add_textequiv(textregion) try: for mm in range(len(found_polygons_marginals)): textregion = ET.SubElement(page, 'TextRegion') @@ -1496,9 +1479,7 @@ class eynollah: textline.set('id', 'l%s' % id_indexer_l) id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') - texteq = ET.SubElement(textline, 'TextEquiv') - uni = ET.SubElement(texteq, 'Unicode') - uni.text = ' ' + add_textequiv(textline) points_co='' for l in range(len(all_found_texline_polygons_marginals[mm][j])): if not self.curved_line: @@ -1523,9 +1504,7 @@ class eynollah: if l<(len(all_found_texline_polygons_marginals[mm][j])-1): points_co=points_co+' ' coord.set('points',points_co) - texteqreg = ET.SubElement(textregion, 'TextEquiv') - unireg = ET.SubElement(texteqreg, 'Unicode') - unireg.text = ' ' + add_textequiv(textregion) except: pass diff --git a/sbb_newspapers_org_image/utils/xml.py b/sbb_newspapers_org_image/utils/xml.py index 0eb10ec..072bca5 100644 --- a/sbb_newspapers_org_image/utils/xml.py +++ b/sbb_newspapers_org_image/utils/xml.py @@ -32,3 +32,7 @@ def create_page_xml(imageFilename, height, width): return pcgts, page +def add_textequiv(parent, text=''): + textequiv = ET.SubElement(parent, 'TextEquiv') + unireg = ET.SubElement(textequiv, 'Unicode') + unireg.text = text From 6d476230ce778154dcbc6e31a243ab7a7346e260 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 16:00:51 +0100 Subject: [PATCH 54/89] remove unnecessary (hope so) try-except --- sbb_newspapers_org_image/eynollah.py | 196 ++++++++++++--------------- 1 file changed, 89 insertions(+), 107 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index e402e86..f7cf193 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ 
b/sbb_newspapers_org_image/eynollah.py @@ -1306,21 +1306,20 @@ class eynollah: self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r'+str(id_indexer)) + textregion = ET.SubElement(page, 'TextRegion') + textregion.set('id', 'r%s' % id_indexer) id_indexer += 1 textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) for j in range(len(all_found_texline_polygons[mm])): - textline=ET.SubElement(textregion, 'TextLine') - textline.set('id', 'l' + str(id_indexer_l)) + textline = ET.SubElement(textregion, 'TextLine') + textline.set('id', 'l%s' % id_indexer_l) id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') add_textequiv(textline) points_co='' for l in range(len(all_found_texline_polygons[mm][j])): - #point = ET.SubElement(coord, 'Point') if not curved_line: if len(all_found_texline_polygons[mm][j][l]) == 2: textline_x_coord = max(0, int((all_found_texline_polygons[mm][j][l][0] + all_box_coord[mm][2] + page_coord[2]) / self.scale_x)) @@ -1354,50 +1353,42 @@ class eynollah: coord.set('points', points_co) add_textequiv(textregion) - try: - #id_indexer_l=0 - try: - id_indexer_l = id_indexer_l - except: - id_indexer_l = 0 - - for mm in range(len(found_polygons_marginals)): - textregion = ET.SubElement(page, 'TextRegion') - textregion.set('id', id_of_marginalia[mm]) - textregion.set('type', 'marginalia') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) - for j in range(len(all_found_texline_polygons_marginals[mm])): - textline=ET.SubElement(textregion, 'TextLine') - textline.set('id','l'+str(id_indexer_l)) - id_indexer_l+=1 - coord = ET.SubElement(textline, 'Coords') - 
add_textequiv(textline) - points_co = '' - for l in range(len(all_found_texline_polygons_marginals[mm][j])): - if not curved_line: - if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + all_box_coord_marginals[mm][0] + page_coord[0])/self.scale_y)) + + for mm in range(len(found_polygons_marginals)): + textregion = ET.SubElement(page, 'TextRegion') + textregion.set('id', id_of_marginalia[mm]) + textregion.set('type', 'marginalia') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) + for j in range(len(all_found_texline_polygons_marginals[mm])): + textline = ET.SubElement(textregion, 'TextLine') + textline.set('id','l'+str(id_indexer_l)) + id_indexer_l += 1 + coord = ET.SubElement(textline, 'Coords') + add_textequiv(textline) + points_co = '' + for l in range(len(all_found_texline_polygons_marginals[mm][j])): + if not curved_line: + if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) else: - if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: - points_co += 
str(int((all_found_texline_polygons_marginals[mm][j][l][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) - if l < len(all_found_texline_polygons_marginals[mm][j]) - 1: - points_co += ' ' - coord.set('points',points_co) - except: - pass + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + all_box_coord_marginals[mm][0] + page_coord[0])/self.scale_y)) + else: + if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + page_coord[0]) / self.scale_y)) + else: + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) + if l < len(all_found_texline_polygons_marginals[mm][j]) - 1: + points_co += ' ' + coord.set('points',points_co) id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) for mm in range(len(found_polygons_text_region_img)): @@ -1414,7 +1405,6 @@ class eynollah: points_co += ' ' coord_text.set('points', points_co) - self.logger.info("filename stem: '%s'", self.image_filename_stem) tree = ET.ElementTree(pcgts) tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") @@ -1462,72 +1452,64 @@ class 
eynollah: textregion=ET.SubElement(page, 'TextRegion') textregion.set('id',' r%s' % id_indexer) id_indexer += 1 - textregion.set('type','drop-capital') + textregion.set('type', 'drop-capital') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) add_textequiv(textregion) - try: - for mm in range(len(found_polygons_marginals)): - textregion = ET.SubElement(page, 'TextRegion') - textregion.set('id', id_of_marginalia[mm]) - textregion.set('type', 'marginalia') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) - for j in range(len(all_found_texline_polygons_marginals[mm])): - textline = ET.SubElement(textregion, 'TextLine') - textline.set('id', 'l%s' % id_indexer_l) - id_indexer_l += 1 - coord = ET.SubElement(textline, 'Coords') - add_textequiv(textline) - points_co='' - for l in range(len(all_found_texline_polygons_marginals[mm][j])): - if not self.curved_line: - if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co+= str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) + for mm in range(len(found_polygons_marginals)): + textregion = ET.SubElement(page, 'TextRegion') + textregion.set('id', id_of_marginalia[mm]) + textregion.set('type', 'marginalia') + coord_text = ET.SubElement(textregion, 'Coords') + 
coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) + + for j in range(len(all_found_texline_polygons_marginals[mm])): + textline = ET.SubElement(textregion, 'TextLine') + textline.set('id', 'l%s' % id_indexer_l) + id_indexer_l += 1 + coord = ET.SubElement(textline, 'Coords') + add_textequiv(textline) + points_co='' + for l in range(len(all_found_texline_polygons_marginals[mm][j])): + if not self.curved_line: + if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) else: - if len(all_found_texline_polygons_marginals[mm][j][l])==2: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co+= str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) + else: + if len(all_found_texline_polygons_marginals[mm][j][l])==2: + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + page_coord[0]) / self.scale_y)) + else: + 
points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) - if l<(len(all_found_texline_polygons_marginals[mm][j])-1): - points_co=points_co+' ' - coord.set('points',points_co) - add_textequiv(textregion) - except: - pass + if l<(len(all_found_texline_polygons_marginals[mm][j])-1): + points_co=points_co+' ' + coord.set('points',points_co) + add_textequiv(textregion) - try: - id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals) - for mm in range(len(found_polygons_text_region_img)): - textregion=ET.SubElement(page, 'ImageRegion') - textregion.set('id','r%s' % id_indexer) - id_indexer += 1 - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) - except: - pass + id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals) + for mm in range(len(found_polygons_text_region_img)): + textregion=ET.SubElement(page, 'ImageRegion') + textregion.set('id', 'r%s' % id_indexer) + id_indexer += 1 + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) - try: - for mm in range(len(found_polygons_tables)): - textregion = ET.SubElement(page, 'TableRegion') - textregion.set('id', 'r%s' %id_indexer) - id_indexer += 1 - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) - except: - pass + for mm in range(len(found_polygons_tables)): + textregion = ET.SubElement(page, 'TableRegion') + textregion.set('id', 'r%s' %id_indexer) + id_indexer 
+= 1 + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) self.logger.info("filename stem: '%s'", self.image_filename_stem) tree = ET.ElementTree(pcgts) From f96a9c52d16e37a098915cd88ef8f0e4daa255f7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 16:10:09 +0100 Subject: [PATCH 55/89] uniform coords calculation --- sbb_newspapers_org_image/eynollah.py | 100 ++++++++++----------------- 1 file changed, 35 insertions(+), 65 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index f7cf193..797de1b 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1160,75 +1160,46 @@ class eynollah: def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l): self.logger.debug('enter serialize_lines_in_region') for j in range(len(all_found_texline_polygons[region_idx])): - textline=ET.SubElement(textregion, 'TextLine') - textline.set('id','l'+str(id_indexer_l)) + textline = ET.SubElement(textregion, 'TextLine') + textline.set('id', 'l%s' % id_indexer_l) id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') add_textequiv(textline) - points_co='' + points_co = '' for l in range(len(all_found_texline_polygons[region_idx][j])): if not self.curved_line: - #point.set('x',str(found_polygons[j][l][0])) - #point.set('y',str(found_polygons[j][l][1])) if len(all_found_texline_polygons[region_idx][j][l])==2: - textline_x_coord=int( (all_found_texline_polygons[region_idx][j][l][0] - +all_box_coord[region_idx][2]+page_coord[2])/self.scale_x) - textline_y_coord=int( (all_found_texline_polygons[region_idx][j][l][1] - +all_box_coord[region_idx][0]+page_coord[0])/self.scale_y) - - if textline_x_coord<0: - textline_x_coord=0 - if textline_y_coord<0: - textline_y_coord=0 - points_co=points_co+str( 
textline_x_coord ) - points_co=points_co+',' - points_co=points_co+str( textline_y_coord ) + textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) else: - - textline_x_coord=int( ( all_found_texline_polygons[region_idx][j][l][0][0] - +all_box_coord[region_idx][2]+page_coord[2])/self.scale_x ) - textline_y_coord=int( ( all_found_texline_polygons[region_idx][j][l][0][1] - +all_box_coord[region_idx][0]+page_coord[0])/self.scale_y) - - if textline_x_coord<0: - textline_x_coord=0 - if textline_y_coord<0: - textline_y_coord=0 - - points_co=points_co+str( textline_x_coord ) - points_co=points_co+',' - points_co=points_co+str( textline_y_coord ) - - if (self.curved_line) and np.abs(slopes[region_idx]) <= 45 : - if len(all_found_texline_polygons[region_idx][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons[region_idx][j][l][0] - +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons[region_idx][j][l][1] - +page_coord[0])/self.scale_y) ) + textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) + points_co += str(textline_x_coord) + points_co += ',' + points_co += str(textline_y_coord) + + if self.curved_line and np.abs(slopes[region_idx]) <= 45: + if len(all_found_texline_polygons[region_idx][j][l]) == 2: + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + page_coord[0]) / 
self.scale_y)) else: - points_co=points_co+str( int( ( all_found_texline_polygons[region_idx][j][l][0][0] - +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons[region_idx][j][l][0][1] - +page_coord[0])/self.scale_y) ) - elif (self.curved_line) and np.abs(slopes[region_idx]) > 45 : + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + page_coord[0])/self.scale_y)) + elif self.curved_line and np.abs(slopes[region_idx]) > 45: if len(all_found_texline_polygons[region_idx][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons[region_idx][j][l][0] - +all_box_coord[region_idx][2]+page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons[region_idx][j][l][1] - +all_box_coord[region_idx][0]+page_coord[0])/self.scale_y) ) + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) else: - points_co=points_co+str( int( ( all_found_texline_polygons[region_idx][j][l][0][0] - +all_box_coord[region_idx][2]+page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons[region_idx][j][l][0][1] - +all_box_coord[region_idx][0]+page_coord[0])/self.scale_y) ) - - if l<(len(all_found_texline_polygons[region_idx][j])-1): - points_co=points_co+' ' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + 
all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) + + if l < len(all_found_texline_polygons[region_idx][j]) - 1: + points_co += ' ' coord.set('points',points_co) return id_indexer_l @@ -1263,7 +1234,7 @@ class eynollah: points_page_print += ',' points_page_print += str(int((self.cont_page[0][lmm][0][1] ) / self.scale_y)) - if lmm < (len( self.cont_page[0] ) - 1): + if lmm < len( self.cont_page[0] ) - 1: points_page_print = points_page_print + ' ' return points_page_print @@ -1318,17 +1289,16 @@ class eynollah: id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') add_textequiv(textline) - points_co='' + points_co = '' for l in range(len(all_found_texline_polygons[mm][j])): if not curved_line: if len(all_found_texline_polygons[mm][j][l]) == 2: textline_x_coord = max(0, int((all_found_texline_polygons[mm][j][l][0] + all_box_coord[mm][2] + page_coord[2]) / self.scale_x)) textline_y_coord = max(0, int((all_found_texline_polygons[mm][j][l][1] + all_box_coord[mm][0] + page_coord[0]) / self.scale_y)) - points_co += str(textline_x_coord) + ',' + str(textline_y_coord) else: textline_x_coord = max(0, int((all_found_texline_polygons[mm][j][l][0][0] + all_box_coord[mm][2]+page_coord[2]) / self.scale_x)) textline_y_coord = max(0, int((all_found_texline_polygons[mm][j][l][0][1] + all_box_coord[mm][0]+page_coord[0]) / self.scale_y)) - points_co += str(textline_x_coord) + ',' + str(textline_y_coord) + points_co += str(textline_x_coord) + ',' + str(textline_y_coord) if curved_line and abs(slopes[mm]) <= 45: if len(all_found_texline_polygons[mm][j][l]) == 2: points_co += str(int((all_found_texline_polygons[mm][j][l][0] + page_coord[2]) / self.scale_x)) @@ -1470,7 +1440,7 @@ class eynollah: id_indexer_l += 1 coord = ET.SubElement(textline, 'Coords') add_textequiv(textline) - points_co='' + points_co = '' for l in range(len(all_found_texline_polygons_marginals[mm][j])): if not self.curved_line: if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: @@ -1491,8 
+1461,8 @@ class eynollah: points_co += ',' points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) - if l<(len(all_found_texline_polygons_marginals[mm][j])-1): - points_co=points_co+' ' + if l < len(all_found_texline_polygons_marginals[mm][j]) - 1: + points_co = points_co+' ' coord.set('points',points_co) add_textequiv(textregion) From 732a27fe778940fa069d0bae390d12c490c0962a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 16:36:11 +0100 Subject: [PATCH 56/89] remove unused variables --- sbb_newspapers_org_image/eynollah.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 797de1b..c7ae090 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -6,7 +6,6 @@ tool to extract table form data from alto xml data import gc import math import os -import random import sys import time import warnings @@ -1659,7 +1658,6 @@ class eynollah: if (x_min_text_only[ii] + 80) >= boxes[jj][0] and (x_min_text_only[ii] + 80) < boxes[jj][1] and y_cor_x_min_main[ii] >= boxes[jj][2] and y_cor_x_min_main[ii] < boxes[jj][3]: arg_text_con.append(jj) break - arg_arg_text_con = np.argsort(arg_text_con) args_contours = np.array(range(len(arg_text_con))) arg_text_con_h = [] @@ -1668,7 +1666,6 @@ class eynollah: if (x_min_text_only_h[ii] + 80) >= boxes[jj][0] and (x_min_text_only_h[ii] + 80) < boxes[jj][1] and y_cor_x_min_main_h[ii] >= boxes[jj][2] and y_cor_x_min_main_h[ii] < boxes[jj][3]: arg_text_con_h.append(jj) break - arg_arg_text_con = np.argsort(arg_text_con_h) args_contours_h = np.array(range(len(arg_text_con_h))) order_by_con_head = np.zeros(len(arg_text_con_h)) @@ -1738,7 +1735,6 @@ class eynollah: if cx_text_only[ii] >= boxes[jj][0] and cx_text_only[ii] < boxes[jj][1] and cy_text_only[ii] >= boxes[jj][2] and cy_text_only[ii] < boxes[jj][3]: # this is valid 
if the center of region identify in which box it is located arg_text_con.append(jj) break - arg_arg_text_con = np.argsort(arg_text_con) args_contours = np.array(range(len(arg_text_con))) order_by_con_main = np.zeros(len(arg_text_con)) @@ -1825,7 +1821,6 @@ class eynollah: if (x_min_text_only[ii] + 80) >= boxes[jj][0] and (x_min_text_only[ii] + 80) < boxes[jj][1] and y_cor_x_min_main[ii] >= boxes[jj][2] and y_cor_x_min_main[ii] < boxes[jj][3]: arg_text_con.append(jj) break - arg_arg_text_con = np.argsort(arg_text_con) args_contours = np.array(range(len(arg_text_con))) order_by_con_main = np.zeros(len(arg_text_con)) @@ -1849,8 +1844,6 @@ class eynollah: indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] - indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] zahler = 0 for mtv in args_contours_box: @@ -1880,7 +1873,6 @@ class eynollah: if cx_text_only[ii] >= boxes[jj][0] and cx_text_only[ii] < boxes[jj][1] and cy_text_only[ii] >= boxes[jj][2] and cy_text_only[ii] < boxes[jj][3]: # this is valid if the center of region identify in which box it is located arg_text_con.append(jj) break - arg_arg_text_con = np.argsort(arg_text_con) args_contours = np.array(range(len(arg_text_con))) order_by_con_main = np.zeros(len(arg_text_con)) @@ -2397,9 +2389,9 @@ class eynollah: num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h_d_ordered) elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, 
axis=2), num_col_classifier, pixel_lines) + num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) else: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) # print(peaks_neg_fin,peaks_neg_fin_d,'num_col2') # print(spliter_y_new,spliter_y_new_d,'num_col_classifier') From 9454a921b1b7cf42244a3c570226d58574a131bf Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 16:37:13 +0100 Subject: [PATCH 57/89] remove unused return_contours_of_interested_region_and_bounding_box --- sbb_newspapers_org_image/eynollah.py | 1 - sbb_newspapers_org_image/unused.py | 21 +++++++++++++++++++++ sbb_newspapers_org_image/utils/contour.py | 20 -------------------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index c7ae090..936e1ef 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -40,7 +40,6 @@ from .utils.contour import ( return_bonding_box_of_contours, return_contours_of_image, return_contours_of_interested_region, - return_contours_of_interested_region_and_bounding_box, return_contours_of_interested_region_by_min_size, return_contours_of_interested_textline, return_parent_contours, diff --git a/sbb_newspapers_org_image/unused.py b/sbb_newspapers_org_image/unused.py index 45bfdd7..199c216 100644 --- a/sbb_newspapers_org_image/unused.py +++ b/sbb_newspapers_org_image/unused.py @@ -3027,3 +3027,24 @@ def seperate_lines_new(img_path, 
thetha, num_col, dir_of_all, f_name): ##plt.imshow(img_patch_ineterst_revised) ##plt.show() return img_patch_ineterst_revised + +def return_contours_of_interested_region_and_bounding_box(region_pre_p, pixel): + + # pixels of images are identified by 5 + cnts_images = (region_pre_p[:, :, 0] == pixel) * 1 + cnts_images = cnts_images.astype(np.uint8) + cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) + imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + contours_imgs, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + + contours_imgs = return_parent_contours(contours_imgs, hiearchy) + contours_imgs = filter_contours_area_of_image_tables(thresh, contours_imgs, hiearchy, max_area=1, min_area=0.0003) + + boxes = [] + + for jj in range(len(contours_imgs)): + x, y, w, h = cv2.boundingRect(contours_imgs[jj]) + boxes.append([int(x), int(y), int(w), int(h)]) + return contours_imgs, boxes + diff --git a/sbb_newspapers_org_image/utils/contour.py b/sbb_newspapers_org_image/utils/contour.py index 7c8a283..042cab9 100644 --- a/sbb_newspapers_org_image/utils/contour.py +++ b/sbb_newspapers_org_image/utils/contour.py @@ -40,26 +40,6 @@ def find_features_of_contours(contours_main): return y_min_main, y_max_main, areas_main -def return_contours_of_interested_region_and_bounding_box(region_pre_p, pixel): - - # pixels of images are identified by 5 - cnts_images = (region_pre_p[:, :, 0] == pixel) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_imgs, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - contours_imgs = return_parent_contours(contours_imgs, hiearchy) - contours_imgs = filter_contours_area_of_image_tables(thresh, contours_imgs, hiearchy, max_area=1, min_area=0.0003) - - 
boxes = [] - - for jj in range(len(contours_imgs)): - x, y, w, h = cv2.boundingRect(contours_imgs[jj]) - boxes.append([int(x), int(y), int(w), int(h)]) - return contours_imgs, boxes - def get_text_region_boxes_by_given_contours(contours): kernel = np.ones((5, 5), np.uint8) From 823592126e63d3c7e51d9b962fc0f71fc45c0ce5 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 16:38:17 +0100 Subject: [PATCH 58/89] remove unused return_bonding_box_of_contours --- sbb_newspapers_org_image/eynollah.py | 1 - sbb_newspapers_org_image/unused.py | 9 +++++++++ sbb_newspapers_org_image/utils/contour.py | 9 --------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 936e1ef..b9d7735 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -37,7 +37,6 @@ from .utils.contour import ( find_new_features_of_contoures, get_text_region_boxes_by_given_contours, get_textregion_contours_in_org_image, - return_bonding_box_of_contours, return_contours_of_image, return_contours_of_interested_region, return_contours_of_interested_region_by_min_size, diff --git a/sbb_newspapers_org_image/unused.py b/sbb_newspapers_org_image/unused.py index 199c216..80654d4 100644 --- a/sbb_newspapers_org_image/unused.py +++ b/sbb_newspapers_org_image/unused.py @@ -3048,3 +3048,12 @@ def return_contours_of_interested_region_and_bounding_box(region_pre_p, pixel): boxes.append([int(x), int(y), int(w), int(h)]) return contours_imgs, boxes +def return_bonding_box_of_contours(cnts): + boxes_tot = [] + for i in range(len(cnts)): + x, y, w, h = cv2.boundingRect(cnts[i]) + + box = [x, y, w, h] + boxes_tot.append(box) + return boxes_tot + diff --git a/sbb_newspapers_org_image/utils/contour.py b/sbb_newspapers_org_image/utils/contour.py index 042cab9..a4cc81a 100644 --- a/sbb_newspapers_org_image/utils/contour.py +++ b/sbb_newspapers_org_image/utils/contour.py @@ -216,15 
+216,6 @@ def return_contours_of_interested_textline(region_pre_p, pixel): contours_imgs = filter_contours_area_of_image_tables(thresh, contours_imgs, hiearchy, max_area=1, min_area=0.000000003) return contours_imgs -def return_bonding_box_of_contours(cnts): - boxes_tot = [] - for i in range(len(cnts)): - x, y, w, h = cv2.boundingRect(cnts[i]) - - box = [x, y, w, h] - boxes_tot.append(box) - return boxes_tot - def return_contours_of_image(image): if len(image.shape) == 2: From e2ae6dbd448343ecedb7078770adf6131218c88f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 16:39:33 +0100 Subject: [PATCH 59/89] remove unused find_features_of_contours --- sbb_newspapers_org_image/eynollah.py | 1 - sbb_newspapers_org_image/unused.py | 13 +++++++++++++ sbb_newspapers_org_image/utils/contour.py | 13 ------------- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index b9d7735..a5ebee5 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -33,7 +33,6 @@ from .utils.contour import ( filter_contours_area_of_image_tables, filter_contours_area_of_image, find_contours_mean_y_diff, - find_features_of_contours, find_new_features_of_contoures, get_text_region_boxes_by_given_contours, get_textregion_contours_in_org_image, diff --git a/sbb_newspapers_org_image/unused.py b/sbb_newspapers_org_image/unused.py index 80654d4..f2c4f8d 100644 --- a/sbb_newspapers_org_image/unused.py +++ b/sbb_newspapers_org_image/unused.py @@ -3057,3 +3057,16 @@ def return_bonding_box_of_contours(cnts): boxes_tot.append(box) return boxes_tot +def find_features_of_contours(contours_main): + + areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) + M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] + cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] + cy_main 
= [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] + x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) + x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) + + y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) + y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) + + return y_min_main, y_max_main, areas_main diff --git a/sbb_newspapers_org_image/utils/contour.py b/sbb_newspapers_org_image/utils/contour.py index a4cc81a..b5002a8 100644 --- a/sbb_newspapers_org_image/utils/contour.py +++ b/sbb_newspapers_org_image/utils/contour.py @@ -26,19 +26,6 @@ def find_contours_mean_y_diff(contours_main): cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] return np.mean(np.diff(np.sort(np.array(cy_main)))) -def find_features_of_contours(contours_main): - - areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - - y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - - return y_min_main, y_max_main, areas_main def get_text_region_boxes_by_given_contours(contours): From 35838069fcacddd1a107e849dbb84fc6a3ab027a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 16:42:53 +0100 Subject: [PATCH 60/89] remove 
unused filter_contours_area_of_image_interiors --- sbb_newspapers_org_image/eynollah.py | 13 +------------ sbb_newspapers_org_image/unused.py | 18 ++++++++++++++++++ sbb_newspapers_org_image/utils/contour.py | 17 ----------------- 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index a5ebee5..af4212d 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -29,7 +29,6 @@ warnings.filterwarnings("ignore") from .utils.contour import ( contours_in_same_horizon, - filter_contours_area_of_image_interiors, filter_contours_area_of_image_tables, filter_contours_area_of_image, find_contours_mean_y_diff, @@ -46,21 +45,11 @@ from .utils.contour import ( from .utils.rotate import ( rotate_image, - rotate_max_area, - rotate_max_area_new, - rotatedRectWithMaxArea, - rotation_image_new, rotation_not_90_func, - rotation_not_90_func_full_layout, - rotyate_image_different, + rotation_not_90_func_full_layout ) from .utils.separate_lines import ( - seperate_lines, - seperate_lines_new_inside_teils, - seperate_lines_new_inside_teils2, - seperate_lines_vertical, - seperate_lines_vertical_cont, textline_contours_postprocessing, seperate_lines_new2, return_deskew_slop, diff --git a/sbb_newspapers_org_image/unused.py b/sbb_newspapers_org_image/unused.py index f2c4f8d..f886e04 100644 --- a/sbb_newspapers_org_image/unused.py +++ b/sbb_newspapers_org_image/unused.py @@ -3070,3 +3070,21 @@ def find_features_of_contours(contours_main): y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) return y_min_main, y_max_main, areas_main + +def filter_contours_area_of_image_interiors(image, contours, hirarchy, max_area, min_area): + found_polygons_early = list() + + jv = 0 + for c in contours: + if len(c) < 3: # A polygon cannot have less than 3 points + continue + + polygon = geometry.Polygon([point[0] for point in c]) + area = 
polygon.area + if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]) and hirarchy[0][jv][3] != -1: + # print(c[0][0][1]) + found_polygons_early.append(np.array([point for point in polygon.exterior.coords], dtype=np.uint)) + jv += 1 + return found_polygons_early + + diff --git a/sbb_newspapers_org_image/utils/contour.py b/sbb_newspapers_org_image/utils/contour.py index b5002a8..06e2ee8 100644 --- a/sbb_newspapers_org_image/utils/contour.py +++ b/sbb_newspapers_org_image/utils/contour.py @@ -56,23 +56,6 @@ def filter_contours_area_of_image(image, contours, hirarchy, max_area, min_area) jv += 1 return found_polygons_early -def filter_contours_area_of_image_interiors(image, contours, hirarchy, max_area, min_area): - found_polygons_early = list() - - jv = 0 - for c in contours: - if len(c) < 3: # A polygon cannot have less than 3 points - continue - - polygon = geometry.Polygon([point[0] for point in c]) - area = polygon.area - if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]) and hirarchy[0][jv][3] != -1: - # print(c[0][0][1]) - found_polygons_early.append(np.array([point for point in polygon.exterior.coords], dtype=np.uint)) - jv += 1 - return found_polygons_early - - def filter_contours_area_of_image_tables(image, contours, hirarchy, max_area, min_area): found_polygons_early = list() From 68d5c0d5238dfa1de90190848b154664e6ca884c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 16:47:59 +0100 Subject: [PATCH 61/89] remove unused return_hor_spliter_by_index_for_without_verticals --- sbb_newspapers_org_image/eynollah.py | 8 -- sbb_newspapers_org_image/unused.py | 78 ++++++++++++++++++ sbb_newspapers_org_image/utils/__init__.py | 79 ------------------- .../utils/separate_lines.py | 23 ------ 4 files changed, 78 insertions(+), 110 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index af4212d..41538b4 100644 --- 
a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -28,8 +28,6 @@ tf.get_logger().setLevel("ERROR") warnings.filterwarnings("ignore") from .utils.contour import ( - contours_in_same_horizon, - filter_contours_area_of_image_tables, filter_contours_area_of_image, find_contours_mean_y_diff, find_new_features_of_contoures, @@ -67,15 +65,9 @@ from .utils.resize import resize_image from .utils import ( boosting_headers_by_longshot_region_segmentation, crop_image_inside_box, - find_features_of_lines, find_num_col, - find_num_col_by_vertical_lines, - find_num_col_deskew, - find_num_col_only_image, - isNaN, otsu_copy, otsu_copy_binary, - return_hor_spliter_by_index_for_without_verticals, delete_seperator_around, return_regions_without_seperators, put_drop_out_from_only_drop_model, diff --git a/sbb_newspapers_org_image/unused.py b/sbb_newspapers_org_image/unused.py index f886e04..1981611 100644 --- a/sbb_newspapers_org_image/unused.py +++ b/sbb_newspapers_org_image/unused.py @@ -3087,4 +3087,82 @@ def filter_contours_area_of_image_interiors(image, contours, hirarchy, max_area, jv += 1 return found_polygons_early +def return_hor_spliter_by_index_for_without_verticals(peaks_neg_fin_t, x_min_hor_some, x_max_hor_some): + # print(peaks_neg_fin_t,x_min_hor_some,x_max_hor_some) + arg_min_hor_sort = np.argsort(x_min_hor_some) + x_min_hor_some_sort = np.sort(x_min_hor_some) + x_max_hor_some_sort = x_max_hor_some[arg_min_hor_sort] + arg_minmax = np.array(range(len(peaks_neg_fin_t))) + indexer_lines = [] + indexes_to_delete = [] + indexer_lines_deletions_len = [] + indexr_uniq_ind = [] + for i in range(len(x_min_hor_some_sort)): + min_h = peaks_neg_fin_t - x_min_hor_some_sort[i] + + max_h = peaks_neg_fin_t - x_max_hor_some_sort[i] + + min_h[0] = min_h[0] # +20 + max_h[len(max_h) - 1] = max_h[len(max_h) - 1] - 20 + + min_h_neg = arg_minmax[(min_h < 0)] + min_h_neg_n = min_h[min_h < 0] + + try: + min_h_neg = [min_h_neg[np.argmax(min_h_neg_n)]] + except: 
+ min_h_neg = [] + + max_h_neg = arg_minmax[(max_h > 0)] + max_h_neg_n = max_h[max_h > 0] + + if len(max_h_neg_n) > 0: + max_h_neg = [max_h_neg[np.argmin(max_h_neg_n)]] + else: + max_h_neg = [] + + if len(min_h_neg) > 0 and len(max_h_neg) > 0: + deletions = list(range(min_h_neg[0] + 1, max_h_neg[0])) + unique_delets_int = [] + # print(deletions,len(deletions),'delii') + if len(deletions) > 0: + + for j in range(len(deletions)): + indexes_to_delete.append(deletions[j]) + # print(deletions,indexes_to_delete,'badiii') + unique_delets = np.unique(indexes_to_delete) + # print(min_h_neg[0],unique_delets) + unique_delets_int = unique_delets[unique_delets < min_h_neg[0]] + + indexer_lines_deletions_len.append(len(deletions)) + indexr_uniq_ind.append([deletions]) + + else: + indexer_lines_deletions_len.append(0) + indexr_uniq_ind.append(-999) + + index_line_true = min_h_neg[0] - len(unique_delets_int) + # print(index_line_true) + if index_line_true > 0 and min_h_neg[0] >= 2: + index_line_true = index_line_true + else: + index_line_true = min_h_neg[0] + + indexer_lines.append(index_line_true) + + if len(unique_delets_int) > 0: + for dd in range(len(unique_delets_int)): + indexes_to_delete.append(unique_delets_int[dd]) + else: + indexer_lines.append(-999) + indexer_lines_deletions_len.append(-999) + indexr_uniq_ind.append(-999) + + peaks_true = [] + for m in range(len(peaks_neg_fin_t)): + if m in indexes_to_delete: + pass + else: + peaks_true.append(peaks_neg_fin_t[m]) + return indexer_lines, peaks_true, arg_min_hor_sort, indexer_lines_deletions_len, indexr_uniq_ind diff --git a/sbb_newspapers_org_image/utils/__init__.py b/sbb_newspapers_org_image/utils/__init__.py index 781864d..daf6edd 100644 --- a/sbb_newspapers_org_image/utils/__init__.py +++ b/sbb_newspapers_org_image/utils/__init__.py @@ -376,85 +376,6 @@ def find_num_col_deskew(regions_without_seperators, sigma_, multiplier=3.8): z = gaussian_filter1d(regions_without_seperators_0, sigma_) return np.std(z) -def 
return_hor_spliter_by_index_for_without_verticals(peaks_neg_fin_t, x_min_hor_some, x_max_hor_some): - # print(peaks_neg_fin_t,x_min_hor_some,x_max_hor_some) - arg_min_hor_sort = np.argsort(x_min_hor_some) - x_min_hor_some_sort = np.sort(x_min_hor_some) - x_max_hor_some_sort = x_max_hor_some[arg_min_hor_sort] - - arg_minmax = np.array(range(len(peaks_neg_fin_t))) - indexer_lines = [] - indexes_to_delete = [] - indexer_lines_deletions_len = [] - indexr_uniq_ind = [] - for i in range(len(x_min_hor_some_sort)): - min_h = peaks_neg_fin_t - x_min_hor_some_sort[i] - - max_h = peaks_neg_fin_t - x_max_hor_some_sort[i] - - min_h[0] = min_h[0] # +20 - max_h[len(max_h) - 1] = max_h[len(max_h) - 1] - 20 - - min_h_neg = arg_minmax[(min_h < 0)] - min_h_neg_n = min_h[min_h < 0] - - try: - min_h_neg = [min_h_neg[np.argmax(min_h_neg_n)]] - except: - min_h_neg = [] - - max_h_neg = arg_minmax[(max_h > 0)] - max_h_neg_n = max_h[max_h > 0] - - if len(max_h_neg_n) > 0: - max_h_neg = [max_h_neg[np.argmin(max_h_neg_n)]] - else: - max_h_neg = [] - - if len(min_h_neg) > 0 and len(max_h_neg) > 0: - deletions = list(range(min_h_neg[0] + 1, max_h_neg[0])) - unique_delets_int = [] - # print(deletions,len(deletions),'delii') - if len(deletions) > 0: - - for j in range(len(deletions)): - indexes_to_delete.append(deletions[j]) - # print(deletions,indexes_to_delete,'badiii') - unique_delets = np.unique(indexes_to_delete) - # print(min_h_neg[0],unique_delets) - unique_delets_int = unique_delets[unique_delets < min_h_neg[0]] - - indexer_lines_deletions_len.append(len(deletions)) - indexr_uniq_ind.append([deletions]) - - else: - indexer_lines_deletions_len.append(0) - indexr_uniq_ind.append(-999) - - index_line_true = min_h_neg[0] - len(unique_delets_int) - # print(index_line_true) - if index_line_true > 0 and min_h_neg[0] >= 2: - index_line_true = index_line_true - else: - index_line_true = min_h_neg[0] - - indexer_lines.append(index_line_true) - - if len(unique_delets_int) > 0: - for dd in 
range(len(unique_delets_int)): - indexes_to_delete.append(unique_delets_int[dd]) - else: - indexer_lines.append(-999) - indexer_lines_deletions_len.append(-999) - indexr_uniq_ind.append(-999) - - peaks_true = [] - for m in range(len(peaks_neg_fin_t)): - if m in indexes_to_delete: - pass - else: - peaks_true.append(peaks_neg_fin_t[m]) - return indexer_lines, peaks_true, arg_min_hor_sort, indexer_lines_deletions_len, indexr_uniq_ind def find_num_col(regions_without_seperators, multiplier=3.8): regions_without_seperators_0 = regions_without_seperators[:, :].sum(axis=0) diff --git a/sbb_newspapers_org_image/utils/separate_lines.py b/sbb_newspapers_org_image/utils/separate_lines.py index 071116b..a1095b0 100644 --- a/sbb_newspapers_org_image/utils/separate_lines.py +++ b/sbb_newspapers_org_image/utils/separate_lines.py @@ -13,31 +13,8 @@ from .contour import ( ) from .is_nan import isNaN from . import ( - boosting_headers_by_longshot_region_segmentation, - crop_image_inside_box, - find_features_of_lines, - find_num_col, - find_num_col_by_vertical_lines, find_num_col_deskew, - find_num_col_only_image, isNaN, - otsu_copy, - otsu_copy_binary, - return_hor_spliter_by_index_for_without_verticals, - delete_seperator_around, - return_regions_without_seperators, - put_drop_out_from_only_drop_model, - putt_bb_of_drop_capitals_of_model_in_patches_in_layout, - check_any_text_region_in_model_one_is_main_or_header, - small_textlines_to_parent_adherence2, - order_and_id_of_texts, - order_of_regions, - implent_law_head_main_not_parallel, - return_hor_spliter_by_index, - combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new, - return_points_with_boundies, - find_number_of_columns_in_document, - return_boxes_of_images_by_order_of_reading_new, ) def dedup_separate_lines(img_patch, contour_text_interest, thetha, axis): From a65caa4d25323b87c91fe02a8e0077652f71c849 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 17:11:32 +0100 Subject: [PATCH 
62/89] :art: unncesssary if True --- sbb_newspapers_org_image/eynollah.py | 171 +++++++++++++-------------- 1 file changed, 79 insertions(+), 92 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 41538b4..5798c63 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -171,93 +171,81 @@ class eynollah: if img.shape[1] < img_width_model: img = cv2.resize(img, (img_height_model, img.shape[0]), interpolation=cv2.INTER_NEAREST) + margin = int(0 * img_width_model) + width_mid = img_width_model - 2 * margin + height_mid = img_height_model - 2 * margin + img = img / float(255.0) + + img_h = img.shape[0] + img_w = img.shape[1] + + prediction_true = np.zeros((img_h, img_w, 3)) + mask_true = np.zeros((img_h, img_w)) + nxf = img_w / float(width_mid) + nyf = img_h / float(height_mid) + + nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) + nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) + + for i in range(nxf): + for j in range(nyf): + if i == 0: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + else: + index_x_d = i * width_mid + index_x_u = index_x_d + img_width_model + if j == 0: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + else: + index_y_d = j * height_mid + index_y_u = index_y_d + img_height_model + + if index_x_u > img_w: + index_x_u = img_w + index_x_d = img_w - img_width_model + if index_y_u > img_h: + index_y_u = img_h + index_y_d = img_h - img_height_model + + img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :] + label_p_pred = model_enhancement.predict(img_patch.reshape(1, img_patch.shape[0], img_patch.shape[1], img_patch.shape[2])) + + seg = label_p_pred[0, :, :, :] + seg = seg * 255 + + if i == 0 and j == 0: + seg = seg[0 : seg.shape[0] - margin, 0 : seg.shape[1] - margin] + prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg + elif i == nxf - 1 and j == nyf - 
1: + seg = seg[margin : seg.shape[0] - 0, margin : seg.shape[1] - 0] + prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - 0, :] = seg + elif i == 0 and j == nyf - 1: + seg = seg[margin : seg.shape[0] - 0, 0 : seg.shape[1] - margin] + prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + 0 : index_x_u - margin, :] = seg + elif i == nxf - 1 and j == 0: + seg = seg[0 : seg.shape[0] - margin, margin : seg.shape[1] - 0] + prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg + elif i == 0 and j != 0 and j != nyf - 1: + seg = seg[margin : seg.shape[0] - margin, 0 : seg.shape[1] - margin] + prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg + elif i == nxf - 1 and j != 0 and j != nyf - 1: + seg = seg[margin : seg.shape[0] - margin, margin : seg.shape[1] - 0] + prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg + elif i != 0 and i != nxf - 1 and j == 0: + seg = seg[0 : seg.shape[0] - margin, margin : seg.shape[1] - margin] + prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg + elif i != 0 and i != nxf - 1 and j == nyf - 1: + seg = seg[margin : seg.shape[0] - 0, margin : seg.shape[1] - margin] + prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - margin, :] = seg + else: + seg = seg[margin : seg.shape[0] - margin, margin : seg.shape[1] - margin] + prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg - margin = True - - if margin: - kernel = np.ones((5, 5), np.uint8) - - margin = int(0 * img_width_model) - - width_mid = img_width_model - 2 * margin - height_mid = img_height_model - 2 * margin - - img = img / float(255.0) - - img_h = img.shape[0] - img_w = img.shape[1] - - prediction_true = np.zeros((img_h, img_w, 3)) - mask_true = np.zeros((img_h, 
img_w)) - nxf = img_w / float(width_mid) - nyf = img_h / float(height_mid) - - nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) - nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) - - for i in range(nxf): - for j in range(nyf): - if i == 0: - index_x_d = i * width_mid - index_x_u = index_x_d + img_width_model - else: - index_x_d = i * width_mid - index_x_u = index_x_d + img_width_model - - if j == 0: - index_y_d = j * height_mid - index_y_u = index_y_d + img_height_model - else: - index_y_d = j * height_mid - index_y_u = index_y_d + img_height_model - - if index_x_u > img_w: - index_x_u = img_w - index_x_d = img_w - img_width_model - if index_y_u > img_h: - index_y_u = img_h - index_y_d = img_h - img_height_model - - img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :] - label_p_pred = model_enhancement.predict(img_patch.reshape(1, img_patch.shape[0], img_patch.shape[1], img_patch.shape[2])) - - seg = label_p_pred[0, :, :, :] - seg = seg * 255 - - if i == 0 and j == 0: - seg = seg[0 : seg.shape[0] - margin, 0 : seg.shape[1] - margin] - prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg - elif i == nxf - 1 and j == nyf - 1: - seg = seg[margin : seg.shape[0] - 0, margin : seg.shape[1] - 0] - prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - 0, :] = seg - elif i == 0 and j == nyf - 1: - seg = seg[margin : seg.shape[0] - 0, 0 : seg.shape[1] - margin] - prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + 0 : index_x_u - margin, :] = seg - elif i == nxf - 1 and j == 0: - seg = seg[0 : seg.shape[0] - margin, margin : seg.shape[1] - 0] - prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg - elif i == 0 and j != 0 and j != nyf - 1: - seg = seg[margin : seg.shape[0] - margin, 0 : seg.shape[1] - margin] - prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg - elif i == nxf - 1 and 
j != 0 and j != nyf - 1: - seg = seg[margin : seg.shape[0] - margin, margin : seg.shape[1] - 0] - prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg - elif i != 0 and i != nxf - 1 and j == 0: - seg = seg[0 : seg.shape[0] - margin, margin : seg.shape[1] - margin] - prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg - elif i != 0 and i != nxf - 1 and j == nyf - 1: - seg = seg[margin : seg.shape[0] - 0, margin : seg.shape[1] - margin] - prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - margin, :] = seg - else: - seg = seg[margin : seg.shape[0] - margin, margin : seg.shape[1] - margin] - prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg - - prediction_true = prediction_true.astype(int) - - del model_enhancement - del session_enhancemnet + prediction_true = prediction_true.astype(int) - return prediction_true + return prediction_true def calculate_width_height_by_columns(self, img, num_col, width_early, label_p_pred): self.logger.debug("enter calculate_width_height_by_columns") @@ -1252,7 +1240,6 @@ class eynollah: id_indexer_l = 0 if len(found_polygons_text_region) > 0: self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) - for mm in range(len(found_polygons_text_region)): textregion = ET.SubElement(page, 'TextRegion') textregion.set('id', 'r%s' % id_indexer) @@ -1282,9 +1269,9 @@ class eynollah: points_co += ',' points_co += str(int((all_found_texline_polygons[mm][j][l][1] + page_coord[0]) / self.scale_y)) else: - points_co = points_co + str(int((all_found_texline_polygons[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co = points_co + ',' - points_co = points_co + str(int((all_found_texline_polygons[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) + points_co += str(int((all_found_texline_polygons[mm][j][l][0][0] + 
page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) elif curved_line and abs(slopes[mm]) > 45: if len(all_found_texline_polygons[mm][j][l]) == 2: points_co += str(int((all_found_texline_polygons[mm][j][l][0] + all_box_coord[mm][2] + page_coord[2]) / self.scale_x)) @@ -1298,7 +1285,6 @@ class eynollah: if l < len(all_found_texline_polygons[mm][j]) - 1: points_co += ' ' coord.set('points', points_co) - add_textequiv(textregion) for mm in range(len(found_polygons_marginals)): @@ -2002,12 +1988,13 @@ class eynollah: text_regions_p = text_regions_p_1[:, :] # long_short_region[:,:]#self.get_regions_from_2_models(image_page) text_regions_p = np.array(text_regions_p) - if num_col_classifier == 1 or num_col_classifier == 2: + if num_col_classifier in (1, 2): try: regions_without_seperators = (text_regions_p[:, :] == 1) * 1 regions_without_seperators = regions_without_seperators.astype(np.uint8) text_regions_p = get_marginals(rotate_image(regions_without_seperators, slope_deskew), text_regions_p, num_col_classifier, slope_deskew, kernel=self.kernel) - except: + except Exception as e: + self.logger.error("exception %s", e) pass if self.plotter: From b44d298d1631e39d1d761d4b39ff44219c1cfb62 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 17:20:35 +0100 Subject: [PATCH 63/89] :memo: readme: compare full vs no-full layout --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 15e0cfe..88de093 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,21 @@ eynollah \ The tool does accept and works better on original images (RGB format) than binarized images. 
+### `--full-layout` vs `--no-full-layout` + +Here are the difference in elements detected depending on the `--full-layout`/`--no-full-layout` command line flags: + +| | `--full-layout` | `--no-full-layout` | +| --- | --- | --- | +| reading order | x | x | +| header regions | x | - | +| text regions | x | x | +| text regions / text line | x | x | +| drop-capitals | x | - | +| marginals | x | x | +| marginals / text line | x | x | +| image region | x | x | + ### How to use First of all, this model makes use of up to 9 trained models which are responsible for different operations like size detection, column classification, image enhancement, page extraction, main layout detection, full layout detection and textline detection. But this does not mean that all 9 models are always required for every document. Based on the document characteristics and parameters specified, different scenarios can be applied. From 718efabf89c765ce1d6fe4a8e31e0a3793ee2fa9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 17:28:23 +0100 Subject: [PATCH 64/89] :art: move xml methods together --- sbb_newspapers_org_image/eynollah.py | 94 ++++++++++++++-------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 5798c63..0dc4574 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1122,52 +1122,6 @@ class eynollah: poly.put(poly_sub) box_sub.put(boxes_sub_new) - def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l): - self.logger.debug('enter serialize_lines_in_region') - for j in range(len(all_found_texline_polygons[region_idx])): - textline = ET.SubElement(textregion, 'TextLine') - textline.set('id', 'l%s' % id_indexer_l) - id_indexer_l += 1 - coord = ET.SubElement(textline, 'Coords') - add_textequiv(textline) - - points_co = '' - for l in 
range(len(all_found_texline_polygons[region_idx][j])): - if not self.curved_line: - if len(all_found_texline_polygons[region_idx][j][l])==2: - textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) - else: - textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) - points_co += str(textline_x_coord) - points_co += ',' - points_co += str(textline_y_coord) - - if self.curved_line and np.abs(slopes[region_idx]) <= 45: - if len(all_found_texline_polygons[region_idx][j][l]) == 2: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + page_coord[0])/self.scale_y)) - elif self.curved_line and np.abs(slopes[region_idx]) > 45: - if len(all_found_texline_polygons[region_idx][j][l])==2: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) - points_co += ',' - 
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) - - if l < len(all_found_texline_polygons[region_idx][j]) - 1: - points_co += ' ' - coord.set('points',points_co) - return id_indexer_l - def calculate_polygon_coords(self, contour_list, i, page_coord): self.logger.debug('enter calculate_polygon_coords') coords = '' @@ -1182,7 +1136,7 @@ class eynollah: coords += str(int((contour_list[i][j][0][1] + page_coord[0]) / self.scale_y)) if j < len(contour_list[i]) - 1: - coords=coords+' ' + coords=coords + ' ' #print(coords) return coords @@ -1225,6 +1179,52 @@ class eynollah: name.set('regionRef', 'r%s' % indexer_region) indexer_region += 1 + def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l): + self.logger.debug('enter serialize_lines_in_region') + for j in range(len(all_found_texline_polygons[region_idx])): + textline = ET.SubElement(textregion, 'TextLine') + textline.set('id', 'l%s' % id_indexer_l) + id_indexer_l += 1 + coord = ET.SubElement(textline, 'Coords') + add_textequiv(textline) + + points_co = '' + for l in range(len(all_found_texline_polygons[region_idx][j])): + if not self.curved_line: + if len(all_found_texline_polygons[region_idx][j][l])==2: + textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) + else: + textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) + points_co += str(textline_x_coord) + points_co += ',' + 
points_co += str(textline_y_coord) + + if self.curved_line and np.abs(slopes[region_idx]) <= 45: + if len(all_found_texline_polygons[region_idx][j][l]) == 2: + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + page_coord[0]) / self.scale_y)) + else: + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + page_coord[0])/self.scale_y)) + elif self.curved_line and np.abs(slopes[region_idx]) > 45: + if len(all_found_texline_polygons[region_idx][j][l])==2: + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) + else: + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) + + if l < len(all_found_texline_polygons[region_idx][j]) - 1: + points_co += ' ' + coord.set('points',points_co) + return id_indexer_l + def write_into_page_xml(self, found_polygons_text_region, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): self.logger.debug('enter write_into_page_xml') From cbbd7fc4f0f68273ef9b8eed248174915b5e16bc Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 23 Feb 2021 17:55:00 +0100 Subject: [PATCH 65/89] make 
line serialization uniform --- sbb_newspapers_org_image/eynollah.py | 155 ++++++++------------------- 1 file changed, 46 insertions(+), 109 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 0dc4574..cee5220 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1179,6 +1179,38 @@ class eynollah: name.set('regionRef', 'r%s' % indexer_region) indexer_region += 1 + def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l): + for j in range(len(all_found_texline_polygons_marginals[marginal_idx])): + textline = ET.SubElement(marginal, 'TextLine') + textline.set('id', 'l%s' % id_indexer_l) + id_indexer_l += 1 + coord = ET.SubElement(textline, 'Coords') + add_textequiv(textline) + points_co = '' + for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])): + if not self.curved_line: + if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) + else: + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0])/self.scale_y)) + else: + if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += 
str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y)) + else: + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y)) + if l < len(all_found_texline_polygons_marginals[marginal_idx][j]) - 1: + points_co += ' ' + coord.set('points',points_co) + return id_indexer_l + def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l): self.logger.debug('enter serialize_lines_in_region') for j in range(len(all_found_texline_polygons[region_idx])): @@ -1247,81 +1279,16 @@ class eynollah: textregion.set('type', 'paragraph') coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) - for j in range(len(all_found_texline_polygons[mm])): - textline = ET.SubElement(textregion, 'TextLine') - textline.set('id', 'l%s' % id_indexer_l) - id_indexer_l += 1 - coord = ET.SubElement(textline, 'Coords') - add_textequiv(textline) - points_co = '' - for l in range(len(all_found_texline_polygons[mm][j])): - if not curved_line: - if len(all_found_texline_polygons[mm][j][l]) == 2: - textline_x_coord = max(0, int((all_found_texline_polygons[mm][j][l][0] + all_box_coord[mm][2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((all_found_texline_polygons[mm][j][l][1] + all_box_coord[mm][0] + page_coord[0]) / self.scale_y)) - else: - textline_x_coord = max(0, int((all_found_texline_polygons[mm][j][l][0][0] + all_box_coord[mm][2]+page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((all_found_texline_polygons[mm][j][l][0][1] + all_box_coord[mm][0]+page_coord[0]) / self.scale_y)) - points_co += str(textline_x_coord) + ',' + str(textline_y_coord) - if curved_line 
and abs(slopes[mm]) <= 45: - if len(all_found_texline_polygons[mm][j][l]) == 2: - points_co += str(int((all_found_texline_polygons[mm][j][l][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons[mm][j][l][1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) - elif curved_line and abs(slopes[mm]) > 45: - if len(all_found_texline_polygons[mm][j][l]) == 2: - points_co += str(int((all_found_texline_polygons[mm][j][l][0] + all_box_coord[mm][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons[mm][j][l][1] + all_box_coord[mm][0] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons[mm][j][l][0][0] + all_box_coord[mm][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons[mm][j][l][0][1] + all_box_coord[mm][0] + page_coord[0]) / self.scale_y)) - - if l < len(all_found_texline_polygons[mm][j]) - 1: - points_co += ' ' - coord.set('points', points_co) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) add_textequiv(textregion) - for mm in range(len(found_polygons_marginals)): - textregion = ET.SubElement(page, 'TextRegion') - textregion.set('id', id_of_marginalia[mm]) - textregion.set('type', 'marginalia') - coord_text = ET.SubElement(textregion, 'Coords') + for marginal_idx in range(len(found_polygons_marginals)): + marginal = ET.SubElement(page, 'TextRegion') + marginal.set('id', id_of_marginalia[mm]) + marginal.set('type', 'marginalia') + coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) 
- for j in range(len(all_found_texline_polygons_marginals[mm])): - textline = ET.SubElement(textregion, 'TextLine') - textline.set('id','l'+str(id_indexer_l)) - id_indexer_l += 1 - coord = ET.SubElement(textline, 'Coords') - add_textequiv(textline) - points_co = '' - for l in range(len(all_found_texline_polygons_marginals[mm][j])): - if not curved_line: - if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + all_box_coord_marginals[mm][0] + page_coord[0])/self.scale_y)) - else: - if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) - if l < len(all_found_texline_polygons_marginals[mm][j]) - 1: - points_co += ' ' - coord.set('points',points_co) + self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l) id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) for mm in range(len(found_polygons_text_region_img)): @@ -1390,44 
+1357,14 @@ class eynollah: coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) add_textequiv(textregion) - for mm in range(len(found_polygons_marginals)): - textregion = ET.SubElement(page, 'TextRegion') - textregion.set('id', id_of_marginalia[mm]) - textregion.set('type', 'marginalia') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) - - for j in range(len(all_found_texline_polygons_marginals[mm])): - textline = ET.SubElement(textregion, 'TextLine') - textline.set('id', 'l%s' % id_indexer_l) - id_indexer_l += 1 - coord = ET.SubElement(textline, 'Coords') - add_textequiv(textline) - points_co = '' - for l in range(len(all_found_texline_polygons_marginals[mm][j])): - if not self.curved_line: - if len(all_found_texline_polygons_marginals[mm][j][l]) == 2: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + all_box_coord_marginals[mm][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co+= str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + all_box_coord_marginals[mm][0] + page_coord[0]) / self.scale_y)) - else: - if len(all_found_texline_polygons_marginals[mm][j][l])==2: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons_marginals[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += 
str(int((all_found_texline_polygons_marginals[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) - - if l < len(all_found_texline_polygons_marginals[mm][j]) - 1: - points_co = points_co+' ' - coord.set('points',points_co) + for marginal_idx in range(len(found_polygons_marginals)): + marginal = ET.SubElement(page, 'TextRegion') add_textequiv(textregion) + marginal.set('id', id_of_marginalia[mm]) + marginal.set('type', 'marginalia') + coord_text = ET.SubElement(marginal, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) + self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l) id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals) for mm in range(len(found_polygons_text_region_img)): From 4e97ca0faea68217aece3ce5ee98c802d7f5a5b3 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 12:09:03 +0100 Subject: [PATCH 66/89] split generation of XML from writing to disk --- sbb_newspapers_org_image/eynollah.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index cee5220..b1ea50f 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1257,9 +1257,13 @@ class eynollah: coord.set('points',points_co) return id_indexer_l + def write_into_page_xml(self, pcgts): + self.logger.info("filename stem: '%s'", self.image_filename_stem) + tree = ET.ElementTree(pcgts) + tree.write(os.path.join(self.dir_out, self.image_filename_stem) + ".xml") - def write_into_page_xml(self, found_polygons_text_region, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, 
all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): - self.logger.debug('enter write_into_page_xml') + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): + self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) @@ -1305,12 +1309,10 @@ class eynollah: points_co += ' ' coord_text.set('points', points_co) - self.logger.info("filename stem: '%s'", self.image_filename_stem) - tree = ET.ElementTree(pcgts) - tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") + return pcgts - def write_into_page_xml_full(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, dir_of_image, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): - self.logger.debug('enter write_into_page_xml_full') + def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): + self.logger.debug('enter build_pagexml_full_layout') # create the file structure pcgts, page = create_page_xml(self.image_filename, self.height_org, 
self.width_org) @@ -1381,9 +1383,7 @@ class eynollah: coord_text = ET.SubElement(textregion, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) - self.logger.info("filename stem: '%s'", self.image_filename_stem) - tree = ET.ElementTree(pcgts) - tree.write(os.path.join(dir_of_image, self.image_filename_stem) + ".xml") + return pcgts def get_regions_from_xy_2models(self,img,is_image_enhanced): self.logger.debug("enter get_regions_from_xy_2models") @@ -2100,7 +2100,7 @@ class eynollah: if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") - self.write_into_page_xml([], page_coord, self.dir_out, [], [], [], [], [], [], [], [], self.curved_line, [], []) + self.write_into_page_xml(self.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], self.curved_line, [], [])) self.logger.info("Job done in %ss", str(time.time() - t1)) return @@ -2329,7 +2329,7 @@ class eynollah: else: order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - self.write_into_page_xml_full(contours_only_text_parent, contours_only_text_parent_h, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals) + self.write_into_page_xml(self.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals)) 
else: contours_only_text_parent_h = None @@ -2338,6 +2338,6 @@ class eynollah: else: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - self.write_into_page_xml(txt_con_org, page_coord, self.dir_out, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals) + self.write_into_page_xml(self.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals)) self.logger.info("Job done in %ss", str(time.time() - t1)) From 6637eff3e74398ace4ef4502b506ee16543d54c5 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 12:12:02 +0100 Subject: [PATCH 67/89] eliminate curved_line argument --- sbb_newspapers_org_image/eynollah.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index b1ea50f..0531640 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1262,7 +1262,7 @@ class eynollah: tree = ET.ElementTree(pcgts) tree.write(os.path.join(self.dir_out, self.image_filename_stem) + ".xml") - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, curved_line, slopes, slopes_marginals): + def 
build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure @@ -2100,7 +2100,7 @@ class eynollah: if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") - self.write_into_page_xml(self.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], self.curved_line, [], [])) + self.write_into_page_xml(self.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], [])) self.logger.info("Job done in %ss", str(time.time() - t1)) return @@ -2338,6 +2338,6 @@ class eynollah: else: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - self.write_into_page_xml(self.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, self.curved_line, slopes, slopes_marginals)) + self.write_into_page_xml(self.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals)) self.logger.info("Job done in %ss", str(time.time() - t1)) From cb6bc17ce1bd530af68ec5e6f775facb03da2b1a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 12:33:37 +0100 Subject: [PATCH 68/89] eliminate more unused vars --- 
sbb_newspapers_org_image/eynollah.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 0531640..92bfb03 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -161,11 +161,10 @@ class eynollah: def predict_enhancement(self, img): self.logger.debug("enter predict_enhancement") - model_enhancement, session_enhancemnet = self.start_new_session_and_model(self.model_dir_of_enhancemnet) + model_enhancement, _ = self.start_new_session_and_model(self.model_dir_of_enhancemnet) img_height_model = model_enhancement.layers[len(model_enhancement.layers) - 1].output_shape[1] img_width_model = model_enhancement.layers[len(model_enhancement.layers) - 1].output_shape[2] - # n_classes = model_enhancement.layers[len(model_enhancement.layers) - 1].output_shape[3] if img.shape[0] < img_height_model: img = cv2.resize(img, (img.shape[1], img_width_model), interpolation=cv2.INTER_NEAREST) @@ -180,7 +179,6 @@ class eynollah: img_w = img.shape[1] prediction_true = np.zeros((img_h, img_w, 3)) - mask_true = np.zeros((img_h, img_w)) nxf = img_w / float(width_mid) nyf = img_h / float(height_mid) @@ -344,7 +342,7 @@ class eynollah: K.clear_session() gc.collect() - img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) + img_new, _ = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) if img_new.shape[1] > img.shape[1]: img_new = self.predict_enhancement(img_new) @@ -355,7 +353,7 @@ class eynollah: def resize_and_enhance_image_with_column_classifier(self): self.logger.debug("enter resize_and_enhance_image_with_column_classifier") dpi = check_dpi(self.image_filename) - self.logger.info("Detected %s DPI" % dpi) + self.logger.info("Detected %s DPI", dpi) img = self.imread() _, page_coord = self.early_page_for_num_of_column_classification() @@ 
-459,8 +457,6 @@ class eynollah: img_height_model = model.layers[len(model.layers) - 1].output_shape[1] img_width_model = model.layers[len(model.layers) - 1].output_shape[2] - n_classes = model.layers[len(model.layers) - 1].output_shape[3] - if not patches: img_h_page = img.shape[0] @@ -1891,7 +1887,7 @@ class eynollah: def run_textline(self, image_page): scaler_h_textline = 1 # 1.2#1.2 scaler_w_textline = 1 # 0.9#1 - textline_mask_tot_ea, textline_mask_tot_long_shot = self.textline_contours(image_page, True, scaler_h_textline, scaler_w_textline) + textline_mask_tot_ea, _ = self.textline_contours(image_page, True, scaler_h_textline, scaler_w_textline) K.clear_session() gc.collect() @@ -1900,7 +1896,7 @@ class eynollah: # plt.show() if self.plotter: self.plotter.save_plot_of_textlines(textline_mask_tot_ea, image_page) - return textline_mask_tot_ea, textline_mask_tot_long_shot + return textline_mask_tot_ea def run_deskew(self, textline_mask_tot_ea): sigma = 2 @@ -2105,7 +2101,7 @@ class eynollah: return t1 = time.time() - textline_mask_tot_ea, textline_mask_tot_long_shot = self.run_textline(image_page) + textline_mask_tot_ea = self.run_textline(image_page) self.logger.info("textline detection took %ss", str(time.time() - t1)) t1 = time.time() @@ -2246,7 +2242,7 @@ class eynollah: if not self.curved_line: slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) - slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) + slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, 
all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) else: scale_param = 1 @@ -2255,7 +2251,6 @@ class eynollah: all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) index_of_vertical_text_contours = np.array(range(len(slopes)))[(abs(np.array(slopes)) > 60)] - contours_text_vertical = [contours_only_text_parent[i] for i in index_of_vertical_text_contours] K.clear_session() gc.collect() @@ -2264,7 +2259,7 @@ class eynollah: if self.full_layout: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, _, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = 
check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) else: contours_only_text_parent_d_ordered = None text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) @@ -2286,9 +2281,9 @@ class eynollah: if not self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h) + num_col, _, matrix_of_lines_ch, spliter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h) else: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h_d_ordered) + _, _, matrix_of_lines_ch_d, spliter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h_d_ordered) elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) From 60208a46f0783a2445b15d11cc6c6b936cbdde8e 
Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 12:53:49 +0100 Subject: [PATCH 69/89] remove unnecessary del/gc.collect, eliminate unnecessary constructs --- sbb_newspapers_org_image/eynollah.py | 158 ++------------------------- 1 file changed, 12 insertions(+), 146 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 92bfb03..71b4752 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -336,11 +336,8 @@ class eynollah: self.logger.info("Found %s columns (%s)", num_col, label_p_pred) session_col_classifier.close() - del model_num_classifier - del session_col_classifier K.clear_session() - gc.collect() img_new, _ = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) @@ -385,13 +382,7 @@ class eynollah: self.logger.info("Found %s columns (%s)", num_col, label_p_pred) session_col_classifier.close() - del model_num_classifier - del session_col_classifier - del img_in - del img_1ch - del page_coord K.clear_session() - gc.collect() if dpi < 298: img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) @@ -439,8 +430,6 @@ class eynollah: self.scale_x = img_res.shape[1] / float(self.image_org.shape[1]) - del img_org - del img_res def start_new_session_and_model(self, model_dir): self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir) @@ -471,10 +460,6 @@ class eynollah: prediction_true = resize_image(seg_color, img_h_page, img_w_page) prediction_true = prediction_true.astype(np.uint8) - del img - del seg_color - del label_p_pred - del seg else: if img.shape[0] < img_height_model: @@ -571,12 +556,6 @@ class eynollah: prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg_color prediction_true = prediction_true.astype(np.uint8) - del img - del mask_true - del seg_color - del seg - del 
img_patch - gc.collect() return prediction_true def early_page_for_num_of_column_classification(self): @@ -598,22 +577,7 @@ class eynollah: box = [x, y, w, h] croped_page, page_coord = crop_image_inside_box(box, img) session_page.close() - del model_page - del session_page - del contours - del thresh - del img - del cnt_size - del cnt - del box - del x - del y - del w - del h - del imgray - del img_page_prediction - - gc.collect() + self.logger.debug("exit early_page_for_num_of_column_classification") return croped_page, page_coord @@ -652,15 +616,8 @@ class eynollah: self.cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) session_page.close() - del model_page - del session_page - del contours - del thresh - del img - del imgray K.clear_session() - gc.collect() self.logger.debug("exit extract_page") return croped_page, page_coord @@ -757,10 +714,6 @@ class eynollah: prediction_regions = resize_image(prediction_regions, img_height_h, img_width_h) session_region.close() - del model_region - del session_region - del img - gc.collect() self.logger.debug("exit extract_text_regions") return prediction_regions, prediction_regions2 @@ -1069,12 +1022,7 @@ class eynollah: ##plt.show() session_textline.close() - del model_textline - del session_textline - del img - del img_org - gc.collect() return prediction_textline[:, :, 0], prediction_textline_longshot_true_size[:, :, 0] def do_work_of_slopes(self, q, poly, box_sub, boxes_per_process, textline_mask_tot, contours_per_process): @@ -1389,47 +1337,20 @@ class eynollah: model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_ens) - gaussian_filter=False - binary=False ratio_y=1.3 ratio_x=1 - median_blur=False img = resize_image(img_org, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x)) - if binary: - img = otsu_copy_binary(img) - img = img.astype(np.uint16) - if median_blur: - 
img = cv2.medianBlur(img,5) - if gaussian_filter: - img= cv2.GaussianBlur(img,(5,5),0) - img = img.astype(np.uint16) - prediction_regions_org_y = self.do_prediction(True, img, model_region) prediction_regions_org_y = resize_image(prediction_regions_org_y, img_height_h, img_width_h ) #plt.imshow(prediction_regions_org_y[:,:,0]) #plt.show() - prediction_regions_org_y=prediction_regions_org_y[:,:,0] - mask_zeros_y=(prediction_regions_org_y[:,:]==0)*1 - if is_image_enhanced: - ratio_x = 1.2 - else: - ratio_x = 1 - ratio_y = 1 - median_blur=False + prediction_regions_org_y = prediction_regions_org_y[:,:,0] + mask_zeros_y = (prediction_regions_org_y[:,:]==0)*1 - img = resize_image(img_org, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x)) - - if binary: - img = otsu_copy_binary(img)#self.otsu_copy(img) - img = img.astype(np.uint16) - if median_blur: - img = cv2.medianBlur(img, 5) - if gaussian_filter: - img = cv2.GaussianBlur(img, (5,5 ), 0) - img = img.astype(np.uint16) + img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1]*(1.2 if is_image_enhanced else 1))) prediction_regions_org = self.do_prediction(True, img, model_region) prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h ) @@ -1437,36 +1358,12 @@ class eynollah: ##plt.imshow(prediction_regions_org[:,:,0]) ##plt.show() prediction_regions_org=prediction_regions_org[:,:,0] - prediction_regions_org[(prediction_regions_org[:,:]==1) & (mask_zeros_y[:,:]==1)]=0 session_region.close() - del model_region - del session_region - gc.collect() model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p2) - - gaussian_filter=False - binary=False - ratio_x=1 - ratio_y=1 - median_blur=False - - img= resize_image(img_org, int(img_org.shape[0]*ratio_y), int(img_org.shape[1]*ratio_x)) - - if binary: - img = otsu_copy_binary(img)#self.otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img=cv2.medianBlur(img,5) - if 
gaussian_filter: - img= cv2.GaussianBlur(img,(5,5),0) - img = img.astype(np.uint16) - - marginal_patch=0.2 - prediction_regions_org2=self.do_prediction(True, img, model_region, marginal_patch) - + img = resize_image(img_org, int(img_org.shape[0]), int(img_org.shape[1])) + prediction_regions_org2 = self.do_prediction(True, img, model_region, 0.2) prediction_regions_org2=resize_image(prediction_regions_org2, img_height_h, img_width_h ) #plt.imshow(prediction_regions_org2[:,:,0]) @@ -1474,22 +1371,13 @@ class eynollah: ##prediction_regions_org=prediction_regions_org[:,:,0] session_region.close() - del model_region - del session_region - gc.collect() - - mask_zeros2=(prediction_regions_org2[:,:,0]==0)*1 - mask_lines2=(prediction_regions_org2[:,:,0]==3)*1 - - text_sume_early=( (prediction_regions_org[:,:]==1)*1 ).sum() - - - prediction_regions_org_copy=np.copy(prediction_regions_org) - - - prediction_regions_org_copy[(prediction_regions_org_copy[:,:]==1) & (mask_zeros2[:,:]==1)]=0 - text_sume_second=( (prediction_regions_org_copy[:,:]==1)*1 ).sum() + mask_zeros2 = (prediction_regions_org2[:,:,0] == 0) + mask_lines2 = (prediction_regions_org2[:,:,0] == 3) + text_sume_early = (prediction_regions_org[:,:] == 1).sum() + prediction_regions_org_copy = np.copy(prediction_regions_org) + prediction_regions_org_copy[(prediction_regions_org_copy[:,:]==1) & (mask_zeros2[:,:]==1)] = 0 + text_sume_second = ((prediction_regions_org_copy[:,:]==1)*1).sum() rate_two_models=text_sume_second/float(text_sume_early)*100 @@ -1501,9 +1389,6 @@ class eynollah: prediction_regions_org[(mask_lines2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 - del mask_lines2 - del mask_zeros2 - del prediction_regions_org2 mask_lines_only=(prediction_regions_org[:,:]==3)*1 @@ -1528,17 +1413,7 @@ class eynollah: text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1)) - del polygons_of_only_texts - del polygons_of_only_images - del polygons_of_only_lines - del 
mask_images_only - del prediction_regions_org - del img - del mask_zeros_y - del prediction_regions_org_y - del img_org - gc.collect() K.clear_session() return text_regions_p_true @@ -1890,7 +1765,6 @@ class eynollah: textline_mask_tot_ea, _ = self.textline_contours(image_page, True, scaler_h_textline, scaler_w_textline) K.clear_session() - gc.collect() #print(np.unique(textline_mask_tot_ea[:, :]), "textline") # plt.imshow(textline_mask_tot_ea) # plt.show() @@ -1954,7 +1828,6 @@ class eynollah: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) K.clear_session() - gc.collect() self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -2002,7 +1875,6 @@ class eynollah: text_regions_p[:, :][text_regions_p[:, :] == 4] = 8 K.clear_session() - # gc.collect() image_page = image_page.astype(np.uint8) # print(type(image_page)) @@ -2012,7 +1884,6 @@ class eynollah: regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 4] = 4 K.clear_session() - gc.collect() # plt.imshow(regions_fully[:,:,0]) # plt.show() @@ -2023,7 +1894,6 @@ class eynollah: # plt.show() K.clear_session() - gc.collect() regions_fully_np, _ = self.extract_text_regions(image_page, False, cols=num_col_classifier) # plt.imshow(regions_fully_np[:,:,0]) @@ -2038,7 +1908,6 @@ class eynollah: # plt.show() K.clear_session() - gc.collect() # plt.imshow(regions_fully[:,:,0]) # plt.show() @@ -2069,7 +1938,6 @@ class eynollah: regions_without_seperators = (text_regions_p[:, :] == 1) * 1 # ( (text_regions_p[:,:]==1) | (text_regions_p[:,:]==2) )*1 #self.return_regions_without_seperators_new(text_regions_p[:,:,0],img_only_regions) K.clear_session() - gc.collect() img_revised_tab = 
np.copy(text_regions_p[:, :]) polygons_of_images = return_contours_of_interested_region(img_revised_tab, 5) self.logger.debug('exit run_boxes_full_layout') @@ -2253,7 +2121,6 @@ class eynollah: index_of_vertical_text_contours = np.array(range(len(slopes)))[(abs(np.array(slopes)) > 60)] K.clear_session() - gc.collect() # print(index_by_text_par_con,'index_by_text_par_con') if self.full_layout: @@ -2269,7 +2136,6 @@ class eynollah: self.plotter.save_plot_of_layout_all(text_regions_p, image_page) K.clear_session() - gc.collect() polygons_of_tabels = [] pixel_img = 4 From 697ff99bbab57ea08620712ad160933a8d2b79b3 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 13:03:42 +0100 Subject: [PATCH 70/89] use constants for "magic numbers" --- sbb_newspapers_org_image/eynollah.py | 30 +++++++++++----------------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 71b4752..acc7c20 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -89,6 +89,8 @@ from .utils.pil_cv2 import check_dpi from .plot import EynollahPlotter SLOPE_THRESHOLD = 0.13 +RATIO_OF_TWO_MODEL_THRESHOLD = 95.50 #98.45: +DPI_THRESHOLD = 298 class eynollah: def __init__( @@ -384,7 +386,7 @@ class eynollah: session_col_classifier.close() K.clear_session() - if dpi < 298: + if dpi < DPI_THRESHOLD: img_new, num_column_is_classified = self.calculate_width_height_by_columns(img, num_col, width_early, label_p_pred) image_res = self.predict_enhancement(img_new) is_image_enhanced = True @@ -1379,19 +1381,14 @@ class eynollah: prediction_regions_org_copy[(prediction_regions_org_copy[:,:]==1) & (mask_zeros2[:,:]==1)] = 0 text_sume_second = ((prediction_regions_org_copy[:,:]==1)*1).sum() - rate_two_models=text_sume_second/float(text_sume_early)*100 + rate_two_models = text_sume_second / float(text_sume_early) * 100 self.logger.info("ratio_of_two_models: %s", 
rate_two_models) - if not(is_image_enhanced and rate_two_models<95.50):#98.45: - prediction_regions_org=np.copy(prediction_regions_org_copy) + if not(is_image_enhanced and rate_two_models < RATIO_OF_TWO_MODEL_THRESHOLD): + prediction_regions_org = np.copy(prediction_regions_org_copy) - ##prediction_regions_org[mask_lines2[:,:]==1]=3 prediction_regions_org[(mask_lines2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 - - - mask_lines_only=(prediction_regions_org[:,:]==3)*1 - prediction_regions_org = cv2.erode(prediction_regions_org[:,:], self.kernel, iterations=2) #plt.imshow(text_region2_1st_channel) @@ -1401,15 +1398,13 @@ class eynollah: mask_texts_only=(prediction_regions_org[:,:]==1)*1 mask_images_only=(prediction_regions_org[:,:]==2)*1 - pixel_img=1 - min_area_text=0.00001 - polygons_of_only_texts=return_contours_of_interested_region(mask_texts_only,pixel_img,min_area_text) - polygons_of_only_images=return_contours_of_interested_region(mask_images_only,pixel_img) - polygons_of_only_lines=return_contours_of_interested_region(mask_lines_only,pixel_img,min_area_text) + polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only, 1, 0.00001) + polygons_of_only_images = return_contours_of_interested_region(mask_images_only, 1) + polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only, 1, 0.00001) - text_regions_p_true=np.zeros(prediction_regions_org.shape) - text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_lines, color=(3,3,3)) - text_regions_p_true[:,:][mask_images_only[:,:]==1]=2 + text_regions_p_true = np.zeros(prediction_regions_org.shape) + text_regions_p_true = cv2.fillPoly(text_regions_p_true,pts = polygons_of_only_lines, color=(3, 3, 3)) + text_regions_p_true[:,:][mask_images_only[:,:] == 1] = 2 text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1)) @@ -1431,7 +1426,6 @@ class eynollah: arg_text_con.append(jj) break args_contours = 
np.array(range(len(arg_text_con))) - arg_text_con_h = [] for ii in range(len(cx_text_only_h)): for jj in range(len(boxes)): From 9dcb696e9967c35b88aa1a68b852e9e56e879dbb Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 13:07:01 +0100 Subject: [PATCH 71/89] use enumerate instead of manual counter --- sbb_newspapers_org_image/eynollah.py | 39 +++++++--------------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index acc7c20..49bc58b 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1462,20 +1462,15 @@ class eynollah: indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] - zahler = 0 - for mtv in args_contours_box: + for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] tartib = np.where(indexes_sorted == arg_order_v)[0][0] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point - zahler = zahler + 1 - zahler = 0 - for mtv in args_contours_box_h: + for zahler, _ in enumerate(args_contours_box_h): arg_order_v = indexes_sorted_head[zahler] tartib = np.where(indexes_sorted == arg_order_v)[0][0] - # print(indexes_sorted,np.where(indexes_sorted==arg_order_v ),arg_order_v,tartib,'inshgalla') order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = tartib + ref_point - zahler = zahler + 1 for jji in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) @@ -1527,12 +1522,11 @@ class eynollah: con_inter_box = [] con_inter_box_h = [] - for i in range(len(args_contours_box)): + for box in args_contours_box: + con_inter_box.append(contours_only_text_parent[box]) - con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) - for i in range(len(args_contours_box_h)): - - 
con_inter_box_h.append(contours_only_text_parent_h[args_contours_box_h[i]]) + for box in args_contours_box_h: + con_inter_box_h.append(contours_only_text_parent_h[box]) indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) @@ -1543,20 +1537,15 @@ class eynollah: indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] - zahler = 0 - for mtv in args_contours_box: + for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] tartib = np.where(indexes_sorted == arg_order_v)[0][0] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point - zahler = zahler + 1 - zahler = 0 - for mtv in args_contours_box_h: + for zahler, _ in enumerate(args_contours_box_h): arg_order_v = indexes_sorted_head[zahler] tartib = np.where(indexes_sorted == arg_order_v)[0][0] - # print(indexes_sorted,np.where(indexes_sorted==arg_order_v ),arg_order_v,tartib,'inshgalla') order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = tartib + ref_point - zahler = zahler + 1 for jji in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) @@ -1588,19 +1577,15 @@ class eynollah: arg_text_con.append(jj) break args_contours = np.array(range(len(arg_text_con))) - order_by_con_main = np.zeros(len(arg_text_con)) ref_point = 0 order_of_texts_tot = [] id_of_texts_tot = [] for iij in range(len(boxes)): - args_contours_box = args_contours[np.array(arg_text_con) == iij] - con_inter_box = [] con_inter_box_h = [] - for i in range(len(args_contours_box)): con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) @@ -1611,12 +1596,10 @@ class eynollah: indexes_sorted_main = 
np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - zahler = 0 - for mtv in args_contours_box: + for zahler, mtv in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] tartib = np.where(indexes_sorted == arg_order_v)[0][0] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point - zahler = zahler + 1 for jji in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) @@ -1664,12 +1647,10 @@ class eynollah: indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] - zahler = 0 - for mtv in args_contours_box: + for zahler, mtv in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] tartib = np.where(indexes_sorted == arg_order_v)[0][0] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point - zahler = zahler + 1 for jji in range(len(id_of_texts)): order_of_texts_tot.append(order_of_texts[jji] + ref_point) From 605f583d21a7d301302dc5fcbe4707908786f28b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 13:32:26 +0100 Subject: [PATCH 72/89] log exceptions as errors --- sbb_newspapers_org_image/eynollah.py | 47 ++++++++++++++-------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 49bc58b..78fc523 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -862,7 +862,8 @@ class eynollah: # old method # slope_for_all=self.textline_contours_to_get_slope_correctly(self.all_text_region_raw[mv],denoised,contours[mv]) # text_patch_processed=textline_contours_postprocessing(gada) - except: + except Exception as why: + self.logger.error(why) slope_for_all = 999 if slope_for_all 
== 999: @@ -914,10 +915,9 @@ class eynollah: mask_biggest2 = resize_image(mask_biggest2, int(mask_biggest2.shape[0] * scale_par), int(mask_biggest2.shape[1] * scale_par)) cnt_textlines_in_image_ind = return_contours_of_interested_textline(mask_biggest2, pixel_img) try: - # textlines_cnt_per_region.append(cnt_textlines_in_image_ind[0]/scale_par) textlines_cnt_per_region.append(cnt_textlines_in_image_ind[0]) - except: - pass + except Exception as why: + self.logger.error(why) else: add_boxes_coor_into_textlines = True textlines_cnt_per_region = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contours_par_per_process[mv], boxes_text[mv], add_boxes_coor_into_textlines) @@ -973,7 +973,8 @@ class eynollah: slope_for_all = return_deskew_slop(img_int_p, sigma_des, plotter=self.plotter) if abs(slope_for_all) <= 0.5: slope_for_all = [slope_deskew][0] - except: + except Exception as why: + self.logger.error(why) slope_for_all = 999 if slope_for_all == 999: @@ -1043,16 +1044,11 @@ class eynollah: textline_con, hierachy = return_contours_of_image(crop_img) textline_con_fil = filter_contours_area_of_image(crop_img, textline_con, hierachy, max_area=1, min_area=0.0008) y_diff_mean = find_contours_mean_y_diff(textline_con_fil) - - sigma_des = int(y_diff_mean * (4.0 / 40.0)) - - if sigma_des < 1: - sigma_des = 1 - + sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) crop_img[crop_img > 0] = 1 slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, plotter=self.plotter) - - except: + except Exception as why: + self.logger.error(why) slope_corresponding_textregion = 999 if slope_corresponding_textregion == 999: @@ -1489,7 +1485,8 @@ class eynollah: tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] order_text_new.append(tartib_new) - except: + except Exception as why: + self.logger.error(why) arg_text_con = [] for ii in range(len(cx_text_only)): for jj in range(len(boxes)): @@ -1615,7 +1612,8 @@ class eynollah: tartib_new = 
np.where(np.array(order_of_texts_tot) == iii)[0][0] order_text_new.append(tartib_new) - except: + except Exception as why: + self.logger.error(why) arg_text_con = [] for ii in range(len(cx_text_only)): for jj in range(len(boxes)): @@ -1706,7 +1704,8 @@ class eynollah: num_col = num_col + 1 if not num_column_is_classified: num_col_classifier = num_col + 1 - except: + except Exception as why: + self.logger.error(why) num_col = None peaks_neg_fin = [] return num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 @@ -2016,14 +2015,14 @@ class eynollah: cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contoures([contours_biggest_d]) cx_bigest_d, cy_biggest_d, _, _, _, _, _ = find_new_features_of_contoures(contours_only_text_parent_d) try: - cx_bigest_d_last5=cx_bigest_d[-5:] - cy_biggest_d_last5=cy_biggest_d[-5:] - dists_d = [math.sqrt((cx_bigest_big[0]-cx_bigest_d_last5[j])**2 + (cy_biggest_big[0]-cy_biggest_d_last5[j])**2) for j in range(len(cy_biggest_d_last5))] - ind_largest=len(cx_bigest_d)-5+np.argmin(dists_d) - cx_bigest_d_big[0]=cx_bigest_d[ind_largest] - cy_biggest_d_big[0]=cy_biggest_d[ind_largest] - except: - pass + cx_bigest_d_last5 = cx_bigest_d[-5:] + cy_biggest_d_last5 = cy_biggest_d[-5:] + dists_d = [math.sqrt((cx_bigest_big[0] - cx_bigest_d_last5[j]) ** 2 + (cy_biggest_big[0] - cy_biggest_d_last5[j]) ** 2) for j in range(len(cy_biggest_d_last5))] + ind_largest = len(cx_bigest_d) -5 + np.argmin(dists_d) + cx_bigest_d_big[0] = cx_bigest_d[ind_largest] + cy_biggest_d_big[0] = cy_biggest_d[ind_largest] + except Exception as why: + self.logger.error(why) (h, w) = text_only.shape[:2] center = (w // 2.0, h // 2.0) From 254dd356663ae238dbba7adc123fc207b97eeb64 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 13:48:20 +0100 Subject: [PATCH 73/89] MAX_SLOPE constant --- sbb_newspapers_org_image/eynollah.py | 37 ++++++---------------------- 1 file changed, 7 
insertions(+), 30 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 78fc523..0267f12 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -3,7 +3,6 @@ tool to extract table form data from alto xml data """ -import gc import math import os import sys @@ -91,6 +90,7 @@ from .plot import EynollahPlotter SLOPE_THRESHOLD = 0.13 RATIO_OF_TWO_MODEL_THRESHOLD = 95.50 #98.45: DPI_THRESHOLD = 298 +MAX_SLOPE = 999 class eynollah: def __init__( @@ -357,19 +357,13 @@ class eynollah: _, page_coord = self.early_page_for_num_of_column_classification() model_num_classifier, session_col_classifier = self.start_new_session_and_model(self.model_dir_of_col_classifier) - img_1ch = self.imread(grayscale=True) - width_early = img_1ch.shape[1] - img_1ch = img_1ch[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] - # plt.imshow(img_1ch) # plt.show() img_1ch = img_1ch / 255.0 - img_1ch = cv2.resize(img_1ch, (448, 448), interpolation=cv2.INTER_NEAREST) - img_in = np.zeros((1, img_1ch.shape[0], img_1ch.shape[1], 3)) img_in[0, :, :, 0] = img_1ch[:, :] img_in[0, :, :, 1] = img_1ch[:, :] @@ -380,9 +374,7 @@ class eynollah: label_p_pred = model_num_classifier.predict(img_in) num_col = np.argmax(label_p_pred[0]) + 1 - self.logger.info("Found %s columns (%s)", num_col, label_p_pred) - session_col_classifier.close() K.clear_session() @@ -430,8 +422,6 @@ class eynollah: self.scale_y = img_res.shape[0] / float(self.image_org.shape[0]) self.scale_x = img_res.shape[1] / float(self.image_org.shape[1]) - - def start_new_session_and_model(self, model_dir): self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir) @@ -864,9 +854,9 @@ class eynollah: # text_patch_processed=textline_contours_postprocessing(gada) except Exception as why: self.logger.error(why) - slope_for_all = 999 + slope_for_all = MAX_SLOPE - if slope_for_all == 999: + if slope_for_all == MAX_SLOPE: 
slope_for_all = [slope_deskew][0] slopes_per_each_subprocess.append(slope_for_all) @@ -975,9 +965,9 @@ class eynollah: slope_for_all = [slope_deskew][0] except Exception as why: self.logger.error(why) - slope_for_all = 999 + slope_for_all = MAX_SLOPE - if slope_for_all == 999: + if slope_for_all == MAX_SLOPE: slope_for_all = [slope_deskew][0] slopes_per_each_subprocess.append(slope_for_all) mask_only_con_region = np.zeros(textline_mask_tot_ea.shape) @@ -1049,9 +1039,9 @@ class eynollah: slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, plotter=self.plotter) except Exception as why: self.logger.error(why) - slope_corresponding_textregion = 999 + slope_corresponding_textregion = MAX_SLOPE - if slope_corresponding_textregion == 999: + if slope_corresponding_textregion == MAX_SLOPE: slope_corresponding_textregion = slope_biggest slopes_sub.append(slope_corresponding_textregion) @@ -1851,28 +1841,21 @@ class eynollah: K.clear_session() image_page = image_page.astype(np.uint8) - # print(type(image_page)) regions_fully, regions_fully_only_drop = self.extract_text_regions(image_page, True, cols=num_col_classifier) text_regions_p[:,:][regions_fully[:,:,0]==6]=6 - regions_fully_only_drop = put_drop_out_from_only_drop_model(regions_fully_only_drop, text_regions_p) regions_fully[:, :, 0][regions_fully_only_drop[:, :, 0] == 4] = 4 K.clear_session() # plt.imshow(regions_fully[:,:,0]) # plt.show() - regions_fully = putt_bb_of_drop_capitals_of_model_in_patches_in_layout(regions_fully) - # plt.imshow(regions_fully[:,:,0]) # plt.show() - K.clear_session() regions_fully_np, _ = self.extract_text_regions(image_page, False, cols=num_col_classifier) - # plt.imshow(regions_fully_np[:,:,0]) # plt.show() - if num_col_classifier > 2: regions_fully_np[:, :, 0][regions_fully_np[:, :, 0] == 4] = 0 else: @@ -1880,20 +1863,14 @@ class eynollah: # plt.imshow(regions_fully_np[:,:,0]) # plt.show() - K.clear_session() - # plt.imshow(regions_fully[:,:,0]) # plt.show() - 
regions_fully = boosting_headers_by_longshot_region_segmentation(regions_fully, regions_fully_np, img_only_regions) - # plt.imshow(regions_fully[:,:,0]) # plt.show() - text_regions_p[:, :][regions_fully[:, :, 0] == 4] = 4 text_regions_p[:, :][regions_fully_np[:, :, 0] == 4] = 4 - #plt.imshow(text_regions_p) #plt.show() From ddeb6938e5d2a3a6796825aed9e4a6e9af4de82d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 13:55:12 +0100 Subject: [PATCH 74/89] self.kernel -> constant KERNEL --- sbb_newspapers_org_image/eynollah.py | 56 +++++++++++++--------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 0267f12..b1028ea 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -91,6 +91,7 @@ SLOPE_THRESHOLD = 0.13 RATIO_OF_TWO_MODEL_THRESHOLD = 95.50 #98.45: DPI_THRESHOLD = 298 MAX_SLOPE = 999 +KERNEL = np.ones((5, 5), np.uint8) class eynollah: def __init__( @@ -131,7 +132,6 @@ class eynollah: ) self.logger = getLogger('eynollah') self.dir_models = dir_models - self.kernel = np.ones((5, 5), np.uint8) self.model_dir_of_enhancemnet = dir_models + "/model_enhancement.h5" self.model_dir_of_col_classifier = dir_models + "/model_scale_classifier.h5" @@ -554,14 +554,13 @@ class eynollah: self.logger.debug("enter early_page_for_num_of_column_classification") img = self.imread() model_page, session_page = self.start_new_session_and_model(self.model_page_dir) - for ii in range(1): - img = cv2.GaussianBlur(img, (5, 5), 0) + img = cv2.GaussianBlur(img, (5, 5), 0) img_page_prediction = self.do_prediction(False, img, model_page) imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) _, thresh = cv2.threshold(imgray, 0, 255, 0) - thresh = cv2.dilate(thresh, self.kernel, iterations=3) + thresh = cv2.dilate(thresh, KERNEL, iterations=3) contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 
cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) cnt = contours[np.argmax(cnt_size)] @@ -576,15 +575,14 @@ class eynollah: def extract_page(self): self.logger.debug("enter extract_page") model_page, session_page = self.start_new_session_and_model(self.model_page_dir) - for ii in range(1): - img = cv2.GaussianBlur(self.image, (5, 5), 0) + img = cv2.GaussianBlur(self.image, (5, 5), 0) img_page_prediction = self.do_prediction(False, img, model_page) imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) _, thresh = cv2.threshold(imgray, 0, 255, 0) - thresh = cv2.dilate(thresh, self.kernel, iterations=3) + thresh = cv2.dilate(thresh, KERNEL, iterations=3) contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) @@ -830,7 +828,7 @@ class eynollah: all_text_region_raw = all_text_region_raw.astype(np.uint8) img_int_p = all_text_region_raw[:, :] - # img_int_p=cv2.erode(img_int_p,self.kernel,iterations = 2) + # img_int_p=cv2.erode(img_int_p,KERNEL,iterations = 2) # plt.imshow(img_int_p) # plt.show() @@ -897,9 +895,9 @@ class eynollah: mask_biggest2 = np.zeros(mask_texts_only.shape) mask_biggest2 = cv2.fillPoly(mask_biggest2, pts=[cnt_textlines_in_image[jjjj]], color=(1, 1, 1)) if num_col + 1 == 1: - mask_biggest2 = cv2.dilate(mask_biggest2, self.kernel, iterations=5) + mask_biggest2 = cv2.dilate(mask_biggest2, KERNEL, iterations=5) else: - mask_biggest2 = cv2.dilate(mask_biggest2, self.kernel, iterations=4) + mask_biggest2 = cv2.dilate(mask_biggest2, KERNEL, iterations=4) pixel_img = 1 mask_biggest2 = resize_image(mask_biggest2, int(mask_biggest2.shape[0] * scale_par), int(mask_biggest2.shape[1] * scale_par)) @@ -941,7 +939,7 @@ class eynollah: all_text_region_raw=(textline_mask_tot_ea*mask_textline[:,:])[boxes_text[mv][1]:boxes_text[mv][1]+boxes_text[mv][3] , boxes_text[mv][0]:boxes_text[mv][0]+boxes_text[mv][2] ] 
all_text_region_raw=all_text_region_raw.astype(np.uint8) img_int_p=all_text_region_raw[:,:]#self.all_text_region_raw[mv] - img_int_p=cv2.erode(img_int_p,self.kernel,iterations = 2) + img_int_p=cv2.erode(img_int_p,KERNEL,iterations = 2) if img_int_p.shape[0]/img_int_p.shape[1]<0.1: slopes_per_each_subprocess.append(0) @@ -1025,11 +1023,9 @@ class eynollah: boxes_sub_new = [] poly_sub = [] for mv in range(len(boxes_per_process)): - crop_img, _ = crop_image_inside_box(boxes_per_process[mv], np.repeat(textline_mask_tot[:, :, np.newaxis], 3, axis=2)) crop_img = crop_img[:, :, 0] - crop_img = cv2.erode(crop_img, self.kernel, iterations=2) - + crop_img = cv2.erode(crop_img, KERNEL, iterations=2) try: textline_con, hierachy = return_contours_of_image(crop_img) textline_con_fil = filter_contours_area_of_image(crop_img, textline_con, hierachy, max_area=1, min_area=0.0008) @@ -1194,7 +1190,7 @@ class eynollah: tree = ET.ElementTree(pcgts) tree.write(os.path.join(self.dir_out, self.image_filename_stem) + ".xml") - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure @@ -1228,7 +1224,7 @@ class eynollah: id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) for mm in range(len(found_polygons_text_region_img)): - textregion=ET.SubElement(page, 'ImageRegion') + textregion = ET.SubElement(page, 'ImageRegion') textregion.set('id', 'r%s' % id_indexer) 
id_indexer += 1 coord_text = ET.SubElement(textregion, 'Coords') @@ -1243,7 +1239,7 @@ class eynollah: return pcgts - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals): + def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes): self.logger.debug('enter build_pagexml_full_layout') # create the file structure @@ -1375,12 +1371,12 @@ class eynollah: prediction_regions_org[(mask_lines2[:,:]==1) & (prediction_regions_org[:,:]==0)]=3 mask_lines_only=(prediction_regions_org[:,:]==3)*1 - prediction_regions_org = cv2.erode(prediction_regions_org[:,:], self.kernel, iterations=2) + prediction_regions_org = cv2.erode(prediction_regions_org[:,:], KERNEL, iterations=2) #plt.imshow(text_region2_1st_channel) #plt.show() - prediction_regions_org = cv2.dilate(prediction_regions_org[:,:], self.kernel, iterations=2) + prediction_regions_org = cv2.dilate(prediction_regions_org[:,:], KERNEL, iterations=2) mask_texts_only=(prediction_regions_org[:,:]==1)*1 mask_images_only=(prediction_regions_org[:,:]==2)*1 @@ -1680,14 +1676,14 @@ class eynollah: mask_images = (text_regions_p_1[:, :] == 2) * 1 mask_images = mask_images.astype(np.uint8) - mask_images = cv2.erode(mask_images[:, :], self.kernel, iterations=10) + mask_images = cv2.erode(mask_images[:, :], KERNEL, iterations=10) 
mask_lines = (text_regions_p_1[:, :] == 3) * 1 mask_lines = mask_lines.astype(np.uint8) img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) - img_only_regions = cv2.erode(img_only_regions_with_sep[:, :], self.kernel, iterations=6) + img_only_regions = cv2.erode(img_only_regions_with_sep[:, :], KERNEL, iterations=6) try: num_col, peaks_neg_fin = find_num_col(img_only_regions, multiplier=6.0) @@ -1739,7 +1735,7 @@ class eynollah: def run_deskew(self, textline_mask_tot_ea): sigma = 2 main_page_deskew = True - slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, self.kernel, iterations=2), sigma, main_page_deskew, plotter=self.plotter) + slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), sigma, main_page_deskew, plotter=self.plotter) slope_first = 0 if self.plotter: @@ -1763,7 +1759,7 @@ class eynollah: try: regions_without_seperators = (text_regions_p[:, :] == 1) * 1 regions_without_seperators = regions_without_seperators.astype(np.uint8) - text_regions_p = get_marginals(rotate_image(regions_without_seperators, slope_deskew), text_regions_p, num_col_classifier, slope_deskew, kernel=self.kernel) + text_regions_p = get_marginals(rotate_image(regions_without_seperators, slope_deskew), text_regions_p, num_col_classifier, slope_deskew, kernel=KERNEL) except Exception as e: self.logger.error("exception %s", e) pass @@ -1798,14 +1794,14 @@ class eynollah: if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: regions_without_seperators = regions_without_seperators.astype(np.uint8) - regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) + regions_without_seperators = cv2.erode(regions_without_seperators[:, :], KERNEL, iterations=6) #random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) 
#random_pixels_for_image[random_pixels_for_image < -0.5] = 0 #random_pixels_for_image[random_pixels_for_image != 0] = 1 #regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 2)] = 1 else: regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) - regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) + regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], KERNEL, iterations=6) #random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 #random_pixels_for_image[random_pixels_for_image != 0] = 1 @@ -2065,9 +2061,9 @@ class eynollah: else: scale_param = 1 - all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) - all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=self.kernel, iterations=1), image_page_rotated, 
boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) index_of_vertical_text_contours = np.array(range(len(slopes)))[(abs(np.array(slopes)) > 60)] @@ -2091,7 +2087,7 @@ class eynollah: polygons_of_tabels = [] pixel_img = 4 polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) - all_found_texline_polygons = adhere_drop_capital_region_into_cprresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, kernel=self.kernel, curved_line=self.curved_line) + all_found_texline_polygons = adhere_drop_capital_region_into_cprresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, kernel=KERNEL, curved_line=self.curved_line) # print(len(contours_only_text_parent_h),len(contours_only_text_parent_h_d_ordered),'contours_only_text_parent_h') pixel_lines = 6 @@ -2114,14 +2110,14 @@ class eynollah: if num_col_classifier >= 3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: regions_without_seperators = regions_without_seperators.astype(np.uint8) - regions_without_seperators = cv2.erode(regions_without_seperators[:, :], self.kernel, iterations=6) + regions_without_seperators = 
cv2.erode(regions_without_seperators[:, :], KERNEL, iterations=6) random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) random_pixels_for_image[random_pixels_for_image < -0.5] = 0 random_pixels_for_image[random_pixels_for_image != 0] = 1 regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 5)] = 1 else: regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) - regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], self.kernel, iterations=6) + regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], KERNEL, iterations=6) random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) random_pixels_for_image[random_pixels_for_image < -0.5] = 0 random_pixels_for_image[random_pixels_for_image != 0] = 1 From 6398579a72549ee1090989cd089abdac27fbd039 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 14:42:15 +0100 Subject: [PATCH 75/89] simplify calculate_page_coords --- sbb_newspapers_org_image/eynollah.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index b1028ea..619642e 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -576,19 +576,14 @@ class eynollah: self.logger.debug("enter extract_page") model_page, session_page = self.start_new_session_and_model(self.model_page_dir) img = cv2.GaussianBlur(self.image, (5, 5), 0) - img_page_prediction = self.do_prediction(False, img, model_page) - imgray = cv2.cvtColor(img_page_prediction, cv2.COLOR_BGR2GRAY) _, thresh = cv2.threshold(imgray, 0, 255, 0) - thresh = cv2.dilate(thresh, KERNEL, iterations=3) contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - cnt_size = 
np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) cnt = contours[np.argmax(cnt_size)] x, y, w, h = cv2.boundingRect(cnt) - if x <= 30: w += x x = 0 @@ -602,9 +597,7 @@ class eynollah: box = [x, y, w, h] croped_page, page_coord = crop_image_inside_box(box, self.image) - self.cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) - session_page.close() K.clear_session() @@ -1071,19 +1064,17 @@ class eynollah: def calculate_page_coords(self): self.logger.debug('enter calculate_page_coords') points_page_print = "" - for lmm in range(len(self.cont_page[0])): - if len(self.cont_page[0][lmm]) == 2: - points_page_print += str(int((self.cont_page[0][lmm][0] ) / self.scale_x)) + for lmm, contour in enumerate(self.cont_page[0]): + if len(contour) == 2: + points_page_print += str(int((contour[0]) / self.scale_x)) points_page_print += ',' - points_page_print += str(int((self.cont_page[0][lmm][1] ) / self.scale_y)) + points_page_print += str(int((contour[1]) / self.scale_y)) else: - points_page_print += str(int((self.cont_page[0][lmm][0][0]) / self.scale_x)) + points_page_print += str(int((contour[0][0]) / self.scale_x)) points_page_print += ',' - points_page_print += str(int((self.cont_page[0][lmm][0][1] ) / self.scale_y)) - - if lmm < len( self.cont_page[0] ) - 1: - points_page_print = points_page_print + ' ' - return points_page_print + points_page_print += str(int((contour[0][1] ) / self.scale_y)) + points_page_print = points_page_print + ' ' + return points_page_print[:-1] def xml_reading_order(self, page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals): """ @@ -1098,7 +1089,7 @@ class eynollah: name = ET.SubElement(region_order_sub, 'RegionRefIndexed') name.set('index', str(indexer_region)) name.set('regionRef', id_of_texts[vj]) - indexer_region+=1 + indexer_region += 1 for vm in range(len(found_polygons_marginals)): 
id_of_marginalia.append('r%s' % indexer_region) name = "coord_text_%s" % indexer_region From 2952a1ca1399001b0d2e903276e7a7222736bd82 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 15:06:28 +0100 Subject: [PATCH 76/89] remove more dead code --- sbb_newspapers_org_image/eynollah.py | 73 +++++++--------------- sbb_newspapers_org_image/unused.py | 19 ++++++ sbb_newspapers_org_image/utils/__init__.py | 18 ------ 3 files changed, 42 insertions(+), 68 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 619642e..94d8d73 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1,4 +1,5 @@ # pylint: disable=no-member,invalid-name,line-too-long,missing-function-docstring +# pylint: disable=too-many-locals,wrong-import-position,too-many-lines """ tool to extract table form data from alto xml data """ @@ -37,7 +38,6 @@ from .utils.contour import ( return_contours_of_interested_region_by_min_size, return_contours_of_interested_textline, return_parent_contours, - return_contours_of_interested_region_by_size, ) from .utils.rotate import ( @@ -65,7 +65,6 @@ from .utils import ( boosting_headers_by_longshot_region_segmentation, crop_image_inside_box, find_num_col, - otsu_copy, otsu_copy_binary, delete_seperator_around, return_regions_without_seperators, @@ -77,8 +76,6 @@ from .utils import ( order_of_regions, implent_law_head_main_not_parallel, return_hor_spliter_by_index, - combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new, - return_points_with_boundies, find_number_of_columns_in_document, return_boxes_of_images_by_order_of_reading_new, ) @@ -668,7 +665,7 @@ class eynollah: img = img.astype(np.uint8) img= resize_image(img, int(img_height_h * 3700 / float(img_width_h)), 3700) else: - img = otsu_copy_binary(img)#self.otsu_copy(img) + img = otsu_copy_binary(img) img = img.astype(np.uint8) img= resize_image(img, int(img_height_h * 0.9), 
int(img_width_h * 0.9)) @@ -852,7 +849,7 @@ class eynollah: slopes_per_each_subprocess.append(slope_for_all) index_by_text_region_contours.append(indexes_r_con_per_pro[mv]) - crop_img, crop_coor = crop_image_inside_box(boxes_text[mv], image_page_rotated) + _, crop_coor = crop_image_inside_box(boxes_text[mv], image_page_rotated) if abs(slope_for_all) < 45: # all_box_coord.append(crop_coor) @@ -925,11 +922,10 @@ class eynollah: index_by_text_region_contours = [] for mv in range(len(boxes_text)): - crop_img,crop_coor=crop_image_inside_box(boxes_text[mv],image_page_rotated) - mask_textline=np.zeros((textline_mask_tot_ea.shape)) - mask_textline=cv2.fillPoly(mask_textline,pts=[contours_per_process[mv]],color=(1,1,1)) - denoised=None - all_text_region_raw=(textline_mask_tot_ea*mask_textline[:,:])[boxes_text[mv][1]:boxes_text[mv][1]+boxes_text[mv][3] , boxes_text[mv][0]:boxes_text[mv][0]+boxes_text[mv][2] ] + _, crop_coor = crop_image_inside_box(boxes_text[mv],image_page_rotated) + mask_textline = np.zeros((textline_mask_tot_ea.shape)) + mask_textline = cv2.fillPoly(mask_textline,pts=[contours_per_process[mv]],color=(1,1,1)) + all_text_region_raw = (textline_mask_tot_ea*mask_textline[:,:])[boxes_text[mv][1]:boxes_text[mv][1]+boxes_text[mv][3] , boxes_text[mv][0]:boxes_text[mv][0]+boxes_text[mv][2] ] all_text_region_raw=all_text_region_raw.astype(np.uint8) img_int_p=all_text_region_raw[:,:]#self.all_text_region_raw[mv] img_int_p=cv2.erode(img_int_p,KERNEL,iterations = 2) @@ -1372,7 +1368,6 @@ class eynollah: mask_images_only=(prediction_regions_org[:,:]==2)*1 polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only, 1, 0.00001) - polygons_of_only_images = return_contours_of_interested_region(mask_images_only, 1) polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only, 1, 0.00001) text_regions_p_true = np.zeros(prediction_regions_org.shape) @@ -1482,7 +1477,6 @@ class eynollah: if cx_text_only_h[ii] >= boxes[jj][0] and 
cx_text_only_h[ii] < boxes[jj][1] and cy_text_only_h[ii] >= boxes[jj][2] and cy_text_only_h[ii] < boxes[jj][3]: # this is valid if the center of region identify in which box it is located arg_text_con_h.append(jj) break - arg_arg_text_con_h = np.argsort(arg_text_con_h) args_contours_h = np.array(range(len(arg_text_con_h))) order_by_con_head = np.zeros(len(arg_text_con_h)) @@ -1490,7 +1484,7 @@ class eynollah: ref_point = 0 order_of_texts_tot = [] id_of_texts_tot = [] - for iij in range(len(boxes)): + for iij, _ in enumerate(boxes): args_contours_box = args_contours[np.array(arg_text_con) == iij] args_contours_box_h = args_contours_h[np.array(arg_text_con_h) == iij] con_inter_box = [] @@ -1521,7 +1515,7 @@ class eynollah: tartib = np.where(indexes_sorted == arg_order_v)[0][0] order_by_con_head[args_contours_box_h[indexes_by_type_head[zahler]]] = tartib + ref_point - for jji in range(len(id_of_texts)): + for jji, _ in enumerate(id_of_texts): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) ref_point = ref_point + len(id_of_texts) @@ -1610,7 +1604,6 @@ class eynollah: con_inter_box_h = [] for i in range(len(args_contours_box)): - con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) indexes_sorted, matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) @@ -1619,15 +1612,13 @@ class eynollah: indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - indexes_sorted_head = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 2] - indexes_by_type_head = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 2] for zahler, mtv in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] 
tartib = np.where(indexes_sorted == arg_order_v)[0][0] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point - for jji in range(len(id_of_texts)): + for jji, _ in enumerate(id_of_texts): order_of_texts_tot.append(order_of_texts[jji] + ref_point) id_of_texts_tot.append(id_of_texts[jji]) ref_point = ref_point + len(id_of_texts) @@ -1661,23 +1652,17 @@ class eynollah: if self.plotter: self.plotter.save_page_image(image_page) - img_g3_page = img_g3[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3], :] - text_regions_p_1 = text_regions_p_1[page_coord[0] : page_coord[1], page_coord[2] : page_coord[3]] - mask_images = (text_regions_p_1[:, :] == 2) * 1 mask_images = mask_images.astype(np.uint8) mask_images = cv2.erode(mask_images[:, :], KERNEL, iterations=10) - mask_lines = (text_regions_p_1[:, :] == 3) * 1 mask_lines = mask_lines.astype(np.uint8) - img_only_regions_with_sep = ((text_regions_p_1[:, :] != 3) & (text_regions_p_1[:, :] != 0)) * 1 img_only_regions_with_sep = img_only_regions_with_sep.astype(np.uint8) img_only_regions = cv2.erode(img_only_regions_with_sep[:, :], KERNEL, iterations=6) - try: - num_col, peaks_neg_fin = find_num_col(img_only_regions, multiplier=6.0) + num_col, _ = find_num_col(img_only_regions, multiplier=6.0) num_col = num_col + 1 if not num_column_is_classified: num_col_classifier = num_col + 1 @@ -1741,7 +1726,6 @@ class eynollah: pixel_img = 1 min_area = 0.00001 max_area = 0.0006 - textline_mask_tot_small_size = return_contours_of_interested_region_by_size(textline_mask_tot, pixel_img, min_area, max_area) text_regions_p_1[mask_lines[:, :] == 1] = 3 text_regions_p = text_regions_p_1[:, :] # long_short_region[:,:]#self.get_regions_from_2_models(image_page) text_regions_p = np.array(text_regions_p) @@ -1753,7 +1737,6 @@ class eynollah: text_regions_p = get_marginals(rotate_image(regions_without_seperators, slope_deskew), text_regions_p, num_col_classifier, slope_deskew, kernel=KERNEL) except Exception 
as e: self.logger.error("exception %s", e) - pass if self.plotter: self.plotter.save_plot_of_layout_main_all(text_regions_p, image_page) @@ -1763,7 +1746,7 @@ class eynollah: def run_boxes_no_full_layout(self, image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier): self.logger.debug('enter run_boxes_no_full_layout') if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n = rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, slope_deskew) + _, textline_mask_tot_d, text_regions_p_1_n = rotation_not_90_func(image_page, textline_mask_tot, text_regions_p, slope_deskew) text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) regions_without_seperators_d = (text_regions_p_1_n[:, :] == 1) * 1 @@ -1774,10 +1757,10 @@ class eynollah: regions_without_seperators_d = None pixel_lines = 3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, seperators_closeup_n = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + _, _, matrix_of_lines_ch, spliter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, seperators_closeup_n_d = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + _, _, matrix_of_lines_ch_d, spliter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) K.clear_session() self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -1786,18 
+1769,9 @@ class eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: regions_without_seperators = regions_without_seperators.astype(np.uint8) regions_without_seperators = cv2.erode(regions_without_seperators[:, :], KERNEL, iterations=6) - #random_pixels_for_image = np.random.randn(regions_without_seperators.shape[0], regions_without_seperators.shape[1]) - #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - #random_pixels_for_image[random_pixels_for_image != 0] = 1 - #regions_without_seperators[(random_pixels_for_image[:, :] == 1) & (text_regions_p[:, :] == 2)] = 1 else: regions_without_seperators_d = regions_without_seperators_d.astype(np.uint8) regions_without_seperators_d = cv2.erode(regions_without_seperators_d[:, :], KERNEL, iterations=6) - #random_pixels_for_image = np.random.randn(regions_without_seperators_d.shape[0], regions_without_seperators_d.shape[1]) - #random_pixels_for_image[random_pixels_for_image < -0.5] = 0 - #random_pixels_for_image[random_pixels_for_image != 0] = 1 - #regions_without_seperators_d[(random_pixels_for_image[:, :] == 1) & (text_regions_p_1_n[:, :] == 2)] = 1 - t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes = return_boxes_of_images_by_order_of_reading_new(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, num_col_classifier) @@ -1862,7 +1836,7 @@ class eynollah: #plt.show() if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - image_page_rotated_n, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout(image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) + _, textline_mask_tot_d, text_regions_p_1_n, regions_fully_n = rotation_not_90_func_full_layout(image_page, textline_mask_tot, text_regions_p, regions_fully, slope_deskew) text_regions_p_1_n = resize_image(text_regions_p_1_n, text_regions_p.shape[0], text_regions_p.shape[1]) textline_mask_tot_d = resize_image(textline_mask_tot_d, text_regions_p.shape[0], text_regions_p.shape[1]) @@ -1902,7 
+1876,7 @@ class eynollah: if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") - self.write_into_page_xml(self.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], [])) + self.write_into_page_xml(self.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [])) self.logger.info("Job done in %ss", str(time.time() - t1)) return @@ -2048,15 +2022,14 @@ class eynollah: if not self.curved_line: slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) - slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) + _, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) else: scale_param = 1 all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) - all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, index_by_text_par_con_marginal, slopes_marginals = 
self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) - index_of_vertical_text_contours = np.array(range(len(slopes)))[(abs(np.array(slopes)) > 60)] K.clear_session() # print(index_by_text_par_con,'index_by_text_par_con') @@ -2067,7 +2040,7 @@ class eynollah: text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, _, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) else: contours_only_text_parent_d_ordered = None - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + text_regions_p, contours_only_text_parent, 
contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, _, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) if self.plotter: self.plotter.save_plot_of_layout(text_regions_p, image_page) @@ -2090,9 +2063,9 @@ class eynollah: _, _, matrix_of_lines_ch_d, spliter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines, contours_only_text_parent_h_d_ordered) elif self.headers_off: if np.abs(slope_deskew) < SLOPE_THRESHOLD: - num_col, peaks_neg_fin, matrix_of_lines_ch, spliter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + num_col, _, matrix_of_lines_ch, spliter_y_new, _ = find_number_of_columns_in_document(np.repeat(text_regions_p[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) else: - num_col_d, peaks_neg_fin_d, matrix_of_lines_ch_d, spliter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) + _, _, matrix_of_lines_ch_d, spliter_y_new_d, _ = find_number_of_columns_in_document(np.repeat(text_regions_p_1_n[:, :, np.newaxis], 3, axis=2), num_col_classifier, pixel_lines) # print(peaks_neg_fin,peaks_neg_fin_d,'num_col2') # print(spliter_y_new,spliter_y_new_d,'num_col_classifier') @@ -2128,7 +2101,7 @@ class eynollah: else: order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - self.write_into_page_xml(self.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, 
page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals)) + self.write_into_page_xml(self.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes)) else: contours_only_text_parent_h = None @@ -2137,6 +2110,6 @@ class eynollah: else: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - self.write_into_page_xml(self.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals)) + self.write_into_page_xml(self.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes)) self.logger.info("Job done in %ss", str(time.time() - t1)) diff --git a/sbb_newspapers_org_image/unused.py b/sbb_newspapers_org_image/unused.py index 1981611..75f35b7 100644 --- a/sbb_newspapers_org_image/unused.py +++ b/sbb_newspapers_org_image/unused.py @@ -3166,3 +3166,22 @@ def 
return_hor_spliter_by_index_for_without_verticals(peaks_neg_fin_t, x_min_hor else: peaks_true.append(peaks_neg_fin_t[m]) return indexer_lines, peaks_true, arg_min_hor_sort, indexer_lines_deletions_len, indexr_uniq_ind + +def otsu_copy(img): + img_r = np.zeros(img.shape) + img1 = img[:, :, 0] + img2 = img[:, :, 1] + img3 = img[:, :, 2] + # print(img.min()) + # print(img[:,:,0].min()) + # blur = cv2.GaussianBlur(img,(5,5)) + # ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) + retval1, threshold1 = cv2.threshold(img1, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + retval2, threshold2 = cv2.threshold(img2, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + retval3, threshold3 = cv2.threshold(img3, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + + img_r[:, :, 0] = threshold1 + img_r[:, :, 1] = threshold1 + img_r[:, :, 2] = threshold1 + return img_r + diff --git a/sbb_newspapers_org_image/utils/__init__.py b/sbb_newspapers_org_image/utils/__init__.py index daf6edd..c724fbe 100644 --- a/sbb_newspapers_org_image/utils/__init__.py +++ b/sbb_newspapers_org_image/utils/__init__.py @@ -299,24 +299,6 @@ def crop_image_inside_box(box, img_org_copy): image_box = img_org_copy[box[1] : box[1] + box[3], box[0] : box[0] + box[2]] return image_box, [box[1], box[1] + box[3], box[0], box[0] + box[2]] -def otsu_copy(img): - img_r = np.zeros(img.shape) - img1 = img[:, :, 0] - img2 = img[:, :, 1] - img3 = img[:, :, 2] - # print(img.min()) - # print(img[:,:,0].min()) - # blur = cv2.GaussianBlur(img,(5,5)) - # ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - retval1, threshold1 = cv2.threshold(img1, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - retval2, threshold2 = cv2.threshold(img2, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - retval3, threshold3 = cv2.threshold(img3, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - - img_r[:, :, 0] = threshold1 - img_r[:, :, 1] = threshold1 - img_r[:, :, 2] = threshold1 - return img_r - def 
otsu_copy_binary(img): img_r = np.zeros((img.shape[0], img.shape[1], 3)) img1 = img[:, :, 0] From d7d388671d7a699480c3eeed7c571b2d7d8216f2 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 15:15:37 +0100 Subject: [PATCH 77/89] return_hor_spliter_by_index is unused --- sbb_newspapers_org_image/eynollah.py | 1 - sbb_newspapers_org_image/unused.py | 67 ++++++++++++++++++++++ sbb_newspapers_org_image/utils/__init__.py | 67 ---------------------- 3 files changed, 67 insertions(+), 68 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 94d8d73..d1ee450 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -75,7 +75,6 @@ from .utils import ( order_and_id_of_texts, order_of_regions, implent_law_head_main_not_parallel, - return_hor_spliter_by_index, find_number_of_columns_in_document, return_boxes_of_images_by_order_of_reading_new, ) diff --git a/sbb_newspapers_org_image/unused.py b/sbb_newspapers_org_image/unused.py index 75f35b7..39e51a5 100644 --- a/sbb_newspapers_org_image/unused.py +++ b/sbb_newspapers_org_image/unused.py @@ -3185,3 +3185,70 @@ def otsu_copy(img): img_r[:, :, 2] = threshold1 return img_r +def return_hor_spliter_by_index(peaks_neg_fin_t, x_min_hor_some, x_max_hor_some): + + arg_min_hor_sort = np.argsort(x_min_hor_some) + x_min_hor_some_sort = np.sort(x_min_hor_some) + x_max_hor_some_sort = x_max_hor_some[arg_min_hor_sort] + + arg_minmax = np.array(range(len(peaks_neg_fin_t))) + indexer_lines = [] + indexes_to_delete = [] + indexer_lines_deletions_len = [] + indexr_uniq_ind = [] + for i in range(len(x_min_hor_some_sort)): + min_h = peaks_neg_fin_t - x_min_hor_some_sort[i] + max_h = peaks_neg_fin_t - x_max_hor_some_sort[i] + + min_h[0] = min_h[0] # +20 + max_h[len(max_h) - 1] = max_h[len(max_h) - 1] ##-20 + + min_h_neg = arg_minmax[(min_h < 0) & (np.abs(min_h) < 360)] + max_h_neg = arg_minmax[(max_h >= 0) & (np.abs(max_h) < 360)] + + if 
len(min_h_neg) > 0 and len(max_h_neg) > 0: + deletions = list(range(min_h_neg[0] + 1, max_h_neg[0])) + unique_delets_int = [] + # print(deletions,len(deletions),'delii') + if len(deletions) > 0: + # print(deletions,len(deletions),'delii2') + + for j in range(len(deletions)): + indexes_to_delete.append(deletions[j]) + # print(deletions,indexes_to_delete,'badiii') + unique_delets = np.unique(indexes_to_delete) + # print(min_h_neg[0],unique_delets) + unique_delets_int = unique_delets[unique_delets < min_h_neg[0]] + + indexer_lines_deletions_len.append(len(deletions)) + indexr_uniq_ind.append([deletions]) + + else: + indexer_lines_deletions_len.append(0) + indexr_uniq_ind.append(-999) + + index_line_true = min_h_neg[0] - len(unique_delets_int) + # print(index_line_true) + if index_line_true > 0 and min_h_neg[0] >= 2: + index_line_true = index_line_true + else: + index_line_true = min_h_neg[0] + + indexer_lines.append(index_line_true) + + if len(unique_delets_int) > 0: + for dd in range(len(unique_delets_int)): + indexes_to_delete.append(unique_delets_int[dd]) + else: + indexer_lines.append(-999) + indexer_lines_deletions_len.append(-999) + indexr_uniq_ind.append(-999) + + peaks_true = [] + for m in range(len(peaks_neg_fin_t)): + if m in indexes_to_delete: + pass + else: + peaks_true.append(peaks_neg_fin_t[m]) + return indexer_lines, peaks_true, arg_min_hor_sort, indexer_lines_deletions_len, indexr_uniq_ind + diff --git a/sbb_newspapers_org_image/utils/__init__.py b/sbb_newspapers_org_image/utils/__init__.py index c724fbe..aed02fc 100644 --- a/sbb_newspapers_org_image/utils/__init__.py +++ b/sbb_newspapers_org_image/utils/__init__.py @@ -1295,73 +1295,6 @@ def implent_law_head_main_not_parallel(text_regions): return text_regions -def return_hor_spliter_by_index(peaks_neg_fin_t, x_min_hor_some, x_max_hor_some): - - arg_min_hor_sort = np.argsort(x_min_hor_some) - x_min_hor_some_sort = np.sort(x_min_hor_some) - x_max_hor_some_sort = x_max_hor_some[arg_min_hor_sort] - - 
arg_minmax = np.array(range(len(peaks_neg_fin_t))) - indexer_lines = [] - indexes_to_delete = [] - indexer_lines_deletions_len = [] - indexr_uniq_ind = [] - for i in range(len(x_min_hor_some_sort)): - min_h = peaks_neg_fin_t - x_min_hor_some_sort[i] - max_h = peaks_neg_fin_t - x_max_hor_some_sort[i] - - min_h[0] = min_h[0] # +20 - max_h[len(max_h) - 1] = max_h[len(max_h) - 1] ##-20 - - min_h_neg = arg_minmax[(min_h < 0) & (np.abs(min_h) < 360)] - max_h_neg = arg_minmax[(max_h >= 0) & (np.abs(max_h) < 360)] - - if len(min_h_neg) > 0 and len(max_h_neg) > 0: - deletions = list(range(min_h_neg[0] + 1, max_h_neg[0])) - unique_delets_int = [] - # print(deletions,len(deletions),'delii') - if len(deletions) > 0: - # print(deletions,len(deletions),'delii2') - - for j in range(len(deletions)): - indexes_to_delete.append(deletions[j]) - # print(deletions,indexes_to_delete,'badiii') - unique_delets = np.unique(indexes_to_delete) - # print(min_h_neg[0],unique_delets) - unique_delets_int = unique_delets[unique_delets < min_h_neg[0]] - - indexer_lines_deletions_len.append(len(deletions)) - indexr_uniq_ind.append([deletions]) - - else: - indexer_lines_deletions_len.append(0) - indexr_uniq_ind.append(-999) - - index_line_true = min_h_neg[0] - len(unique_delets_int) - # print(index_line_true) - if index_line_true > 0 and min_h_neg[0] >= 2: - index_line_true = index_line_true - else: - index_line_true = min_h_neg[0] - - indexer_lines.append(index_line_true) - - if len(unique_delets_int) > 0: - for dd in range(len(unique_delets_int)): - indexes_to_delete.append(unique_delets_int[dd]) - else: - indexer_lines.append(-999) - indexer_lines_deletions_len.append(-999) - indexr_uniq_ind.append(-999) - - peaks_true = [] - for m in range(len(peaks_neg_fin_t)): - if m in indexes_to_delete: - pass - else: - peaks_true.append(peaks_neg_fin_t[m]) - return indexer_lines, peaks_true, arg_min_hor_sort, indexer_lines_deletions_len, indexr_uniq_ind - def 
combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(img_p_in_ver, img_in_hor,num_col_classifier): #img_p_in_ver = cv2.erode(img_p_in_ver, self.kernel, iterations=2) img_p_in_ver=img_p_in_ver.astype(np.uint8) From 133982380feb0521e991e18fc4735ee33a0fcd94 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 15:17:02 +0100 Subject: [PATCH 78/89] implent_law_head_main_not_parallel is unused --- sbb_newspapers_org_image/eynollah.py | 2 +- sbb_newspapers_org_image/unused.py | 93 ++++++++++++++++++++++ sbb_newspapers_org_image/utils/__init__.py | 93 ---------------------- 3 files changed, 94 insertions(+), 94 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index d1ee450..0a80342 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1,5 +1,6 @@ # pylint: disable=no-member,invalid-name,line-too-long,missing-function-docstring # pylint: disable=too-many-locals,wrong-import-position,too-many-lines +# pylint: disable=too-many-public-methods """ tool to extract table form data from alto xml data """ @@ -74,7 +75,6 @@ from .utils import ( small_textlines_to_parent_adherence2, order_and_id_of_texts, order_of_regions, - implent_law_head_main_not_parallel, find_number_of_columns_in_document, return_boxes_of_images_by_order_of_reading_new, ) diff --git a/sbb_newspapers_org_image/unused.py b/sbb_newspapers_org_image/unused.py index 39e51a5..d61af37 100644 --- a/sbb_newspapers_org_image/unused.py +++ b/sbb_newspapers_org_image/unused.py @@ -3252,3 +3252,96 @@ def return_hor_spliter_by_index(peaks_neg_fin_t, x_min_hor_some, x_max_hor_some) peaks_true.append(peaks_neg_fin_t[m]) return indexer_lines, peaks_true, arg_min_hor_sort, indexer_lines_deletions_len, indexr_uniq_ind +def implent_law_head_main_not_parallel(text_regions): + # print(text_regions.shape) + text_indexes = [1, 2] # 1: main text , 2: header , 3: comments + + for t_i in text_indexes: + 
textline_mask = text_regions[:, :] == t_i + textline_mask = textline_mask * 255.0 + + textline_mask = textline_mask.astype(np.uint8) + textline_mask = np.repeat(textline_mask[:, :, np.newaxis], 3, axis=2) + kernel = np.ones((5, 5), np.uint8) + + # print(type(textline_mask),np.unique(textline_mask),textline_mask.shape) + imgray = cv2.cvtColor(textline_mask, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(imgray, 0, 255, 0) + + if t_i == 1: + contours_main, hirarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + # print(type(contours_main)) + areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) + M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] + cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] + cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] + x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) + x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) + + y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) + y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) + # print(contours_main[0],np.shape(contours_main[0]),contours_main[0][:,0,0]) + elif t_i == 2: + contours_header, hirarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + # print(type(contours_header)) + areas_header = np.array([cv2.contourArea(contours_header[j]) for j in range(len(contours_header))]) + M_header = [cv2.moments(contours_header[j]) for j in range(len(contours_header))] + cx_header = [(M_header[j]["m10"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] + cy_header = [(M_header[j]["m01"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] + + x_min_header = np.array([np.min(contours_header[j][:, 0, 0]) for j 
in range(len(contours_header))]) + x_max_header = np.array([np.max(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) + + y_min_header = np.array([np.min(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) + y_max_header = np.array([np.max(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) + + args = np.array(range(1, len(cy_header) + 1)) + args_main = np.array(range(1, len(cy_main) + 1)) + for jj in range(len(contours_main)): + headers_in_main = [(cy_header > y_min_main[jj]) & ((cy_header < y_max_main[jj]))] + mains_in_main = [(cy_main > y_min_main[jj]) & ((cy_main < y_max_main[jj]))] + args_log = args * headers_in_main + res = args_log[args_log > 0] + res_true = res - 1 + + args_log_main = args_main * mains_in_main + res_main = args_log_main[args_log_main > 0] + res_true_main = res_main - 1 + + if len(res_true) > 0: + sum_header = np.sum(areas_header[res_true]) + sum_main = np.sum(areas_main[res_true_main]) + if sum_main > sum_header: + cnt_int = [contours_header[j] for j in res_true] + text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(1, 1, 1)) + else: + cnt_int = [contours_main[j] for j in res_true_main] + text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(2, 2, 2)) + + for jj in range(len(contours_header)): + main_in_header = [(cy_main > y_min_header[jj]) & ((cy_main < y_max_header[jj]))] + header_in_header = [(cy_header > y_min_header[jj]) & ((cy_header < y_max_header[jj]))] + args_log = args_main * main_in_header + res = args_log[args_log > 0] + res_true = res - 1 + + args_log_header = args * header_in_header + res_header = args_log_header[args_log_header > 0] + res_true_header = res_header - 1 + + if len(res_true) > 0: + + sum_header = np.sum(areas_header[res_true_header]) + sum_main = np.sum(areas_main[res_true]) + + if sum_main > sum_header: + + cnt_int = [contours_header[j] for j in res_true_header] + text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(1, 1, 1)) + else: + 
cnt_int = [contours_main[j] for j in res_true] + text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(2, 2, 2)) + + return text_regions + + diff --git a/sbb_newspapers_org_image/utils/__init__.py b/sbb_newspapers_org_image/utils/__init__.py index aed02fc..a3f7686 100644 --- a/sbb_newspapers_org_image/utils/__init__.py +++ b/sbb_newspapers_org_image/utils/__init__.py @@ -1202,99 +1202,6 @@ def order_of_regions(textline_mask, contours_main, contours_header, y_ref): return final_indexers_sorted, matrix_of_orders, final_types, final_index_type -def implent_law_head_main_not_parallel(text_regions): - # print(text_regions.shape) - text_indexes = [1, 2] # 1: main text , 2: header , 3: comments - - for t_i in text_indexes: - textline_mask = text_regions[:, :] == t_i - textline_mask = textline_mask * 255.0 - - textline_mask = textline_mask.astype(np.uint8) - textline_mask = np.repeat(textline_mask[:, :, np.newaxis], 3, axis=2) - kernel = np.ones((5, 5), np.uint8) - - # print(type(textline_mask),np.unique(textline_mask),textline_mask.shape) - imgray = cv2.cvtColor(textline_mask, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - if t_i == 1: - contours_main, hirarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - # print(type(contours_main)) - areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - - y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - y_max_main = 
np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - # print(contours_main[0],np.shape(contours_main[0]),contours_main[0][:,0,0]) - elif t_i == 2: - contours_header, hirarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - # print(type(contours_header)) - areas_header = np.array([cv2.contourArea(contours_header[j]) for j in range(len(contours_header))]) - M_header = [cv2.moments(contours_header[j]) for j in range(len(contours_header))] - cx_header = [(M_header[j]["m10"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] - cy_header = [(M_header[j]["m01"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] - - x_min_header = np.array([np.min(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - x_max_header = np.array([np.max(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - - y_min_header = np.array([np.min(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - y_max_header = np.array([np.max(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - - args = np.array(range(1, len(cy_header) + 1)) - args_main = np.array(range(1, len(cy_main) + 1)) - for jj in range(len(contours_main)): - headers_in_main = [(cy_header > y_min_main[jj]) & ((cy_header < y_max_main[jj]))] - mains_in_main = [(cy_main > y_min_main[jj]) & ((cy_main < y_max_main[jj]))] - args_log = args * headers_in_main - res = args_log[args_log > 0] - res_true = res - 1 - - args_log_main = args_main * mains_in_main - res_main = args_log_main[args_log_main > 0] - res_true_main = res_main - 1 - - if len(res_true) > 0: - sum_header = np.sum(areas_header[res_true]) - sum_main = np.sum(areas_main[res_true_main]) - if sum_main > sum_header: - cnt_int = [contours_header[j] for j in res_true] - text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(1, 1, 1)) - else: - cnt_int = [contours_main[j] for j in res_true_main] - text_regions = cv2.fillPoly(text_regions, 
pts=cnt_int, color=(2, 2, 2)) - - for jj in range(len(contours_header)): - main_in_header = [(cy_main > y_min_header[jj]) & ((cy_main < y_max_header[jj]))] - header_in_header = [(cy_header > y_min_header[jj]) & ((cy_header < y_max_header[jj]))] - args_log = args_main * main_in_header - res = args_log[args_log > 0] - res_true = res - 1 - - args_log_header = args * header_in_header - res_header = args_log_header[args_log_header > 0] - res_true_header = res_header - 1 - - if len(res_true) > 0: - - sum_header = np.sum(areas_header[res_true_header]) - sum_main = np.sum(areas_main[res_true]) - - if sum_main > sum_header: - - cnt_int = [contours_header[j] for j in res_true_header] - text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(1, 1, 1)) - else: - cnt_int = [contours_main[j] for j in res_true] - text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(2, 2, 2)) - - return text_regions - - def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new(img_p_in_ver, img_in_hor,num_col_classifier): #img_p_in_ver = cv2.erode(img_p_in_ver, self.kernel, iterations=2) img_p_in_ver=img_p_in_ver.astype(np.uint8) From c80fddb3b80cee5208b9657afb90fbf5090e55c6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 15:23:55 +0100 Subject: [PATCH 79/89] remove unused delete_seperator_around --- sbb_newspapers_org_image/eynollah.py | 10 ++++------ sbb_newspapers_org_image/unused.py | 20 ++++++++++++++++++++ sbb_newspapers_org_image/utils/__init__.py | 22 ---------------------- 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 0a80342..7d32c14 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1,6 +1,6 @@ # pylint: disable=no-member,invalid-name,line-too-long,missing-function-docstring -# pylint: disable=too-many-locals,wrong-import-position,too-many-lines -# pylint: 
disable=too-many-public-methods +# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements +# pylint: disable=too-many-public-methods,too-many-arguments,too-many-instance-attributes,too-many-public-methods, """ tool to extract table form data from alto xml data """ @@ -67,8 +67,6 @@ from .utils import ( crop_image_inside_box, find_num_col, otsu_copy_binary, - delete_seperator_around, - return_regions_without_seperators, put_drop_out_from_only_drop_model, putt_bb_of_drop_capitals_of_model_in_patches_in_layout, check_any_text_region_in_model_one_is_main_or_header, @@ -107,7 +105,7 @@ class eynollah: allow_scaling=False, headers_off=False ): - self.image_filename = image_filename # XXX This does not seem to be a directory as the name suggests, but a file + self.image_filename = image_filename self.cont_page = [] self.dir_out = dir_out self.image_filename_stem = image_filename_stem @@ -137,7 +135,7 @@ class eynollah: self.model_region_dir_fully = dir_models + "/model_3up_new_good_no_augmentation.h5" self.model_page_dir = dir_models + "/model_page_mixed_best.h5" self.model_region_dir_p_ens = dir_models + "/model_ensemble_s.h5" - self.model_textline_dir = dir_models + "/model_textline_newspapers.h5" + self.model_textline_dir = dir_models + "/model_textline_newspapers.h5" self._imgs = {} diff --git a/sbb_newspapers_org_image/unused.py b/sbb_newspapers_org_image/unused.py index d61af37..e3514b8 100644 --- a/sbb_newspapers_org_image/unused.py +++ b/sbb_newspapers_org_image/unused.py @@ -3344,4 +3344,24 @@ def implent_law_head_main_not_parallel(text_regions): return text_regions +def delete_seperator_around(spliter_y, peaks_neg, image_by_region): + # format of subboxes box=[x1, x2 , y1, y2] + + if len(image_by_region.shape) == 3: + for i in range(len(spliter_y) - 1): + for j in range(1, len(peaks_neg[i]) - 1): + image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + 
int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0] == 6] = 0 + image_by_region[spliter_y[i] : spliter_y[i + 1], peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 1] == 6] = 0 + image_by_region[spliter_y[i] : spliter_y[i + 1], peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 2] == 6] = 0 + + image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0] == 7] = 0 + image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 1] == 7] = 0 + image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 2] == 7] = 0 + else: + for i in range(len(spliter_y) - 1): + 
for j in range(1, len(peaks_neg[i]) - 1): + image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])] == 6] = 0 + + image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])] == 7] = 0 + return image_by_region diff --git a/sbb_newspapers_org_image/utils/__init__.py b/sbb_newspapers_org_image/utils/__init__.py index a3f7686..120dfc5 100644 --- a/sbb_newspapers_org_image/utils/__init__.py +++ b/sbb_newspapers_org_image/utils/__init__.py @@ -718,28 +718,6 @@ def find_num_col_by_vertical_lines(regions_without_seperators, multiplier=3.8): # print(peaks,'peaksnew') return peaks - -def delete_seperator_around(spliter_y, peaks_neg, image_by_region): - # format of subboxes box=[x1, x2 , y1, y2] - - if len(image_by_region.shape) == 3: - for i in range(len(spliter_y) - 1): - for j in range(1, len(peaks_neg[i]) - 1): - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0] == 6] = 0 - image_by_region[spliter_y[i] : spliter_y[i + 1], peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : 
peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 1] == 6] = 0 - image_by_region[spliter_y[i] : spliter_y[i + 1], peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 2] == 6] = 0 - - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0] == 7] = 0 - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 1] == 7] = 0 - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 2] == 7] = 0 - else: - for i in range(len(spliter_y) - 1): - for j in range(1, len(peaks_neg[i]) - 1): - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])] == 6] = 0 - - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * 
peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])] == 7] = 0 - return image_by_region - def return_regions_without_seperators(regions_pre): kernel = np.ones((5, 5), np.uint8) regions_without_seperators = ((regions_pre[:, :] != 6) & (regions_pre[:, :] != 0)) * 1 From 0eda4a174aea46da3944cb5e8ffb9985961d8eda Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 15:53:31 +0100 Subject: [PATCH 80/89] move xml_reading_order to utils.xml --- sbb_newspapers_org_image/eynollah.py | 46 +++++++-------------------- sbb_newspapers_org_image/utils/xml.py | 23 ++++++++++++++ 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 7d32c14..b4779da 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1,6 +1,7 @@ -# pylint: disable=no-member,invalid-name,line-too-long,missing-function-docstring -# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements +# pylint: disable=no-member,invalid-name,line-too-long,missing-function-docstring,missing-class-docstring,too-many-branches +# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member # pylint: disable=too-many-public-methods,too-many-arguments,too-many-instance-attributes,too-many-public-methods, +# pylint: disable=consider-using-enumerate """ tool to extract table form data from alto xml data """ @@ -77,7 +78,7 @@ from .utils import ( return_boxes_of_images_by_order_of_reading_new, ) -from .utils.xml import create_page_xml, add_textequiv +from .utils.xml import create_page_xml, add_textequiv, xml_reading_order from .utils.pil_cv2 import check_dpi from .plot import 
EynollahPlotter @@ -384,6 +385,7 @@ class eynollah: self.logger.debug("exit resize_and_enhance_image_with_column_classifier") return is_image_enhanced, img, image_res, num_col, num_column_is_classified + # pylint: disable=attribute-defined-outside-init def get_image_and_scales(self, img_org, img_res, scale): self.logger.debug("enter get_image_and_scales") self.image = np.copy(img_res) @@ -1057,7 +1059,7 @@ class eynollah: def calculate_page_coords(self): self.logger.debug('enter calculate_page_coords') points_page_print = "" - for lmm, contour in enumerate(self.cont_page[0]): + for _, contour in enumerate(self.cont_page[0]): if len(contour) == 2: points_page_print += str(int((contour[0]) / self.scale_x)) points_page_print += ',' @@ -1069,28 +1071,6 @@ class eynollah: points_page_print = points_page_print + ' ' return points_page_print[:-1] - def xml_reading_order(self, page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals): - """ - XXX side-effect: extends id_of_marginalia - """ - region_order = ET.SubElement(page, 'ReadingOrder') - region_order_sub = ET.SubElement(region_order, 'OrderedGroup') - region_order_sub.set('id', "ro357564684568544579089") - indexer_region = 0 - for vj in order_of_texts: - name = "coord_text_%s" % vj - name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index', str(indexer_region)) - name.set('regionRef', id_of_texts[vj]) - indexer_region += 1 - for vm in range(len(found_polygons_marginals)): - id_of_marginalia.append('r%s' % indexer_region) - name = "coord_text_%s" % indexer_region - name = ET.SubElement(region_order_sub, 'RegionRefIndexed') - name.set('index', str(indexer_region)) - name.set('regionRef', 'r%s' % indexer_region) - indexer_region += 1 - def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l): for j in range(len(all_found_texline_polygons_marginals[marginal_idx])): textline = 
ET.SubElement(marginal, 'TextLine') @@ -1187,7 +1167,7 @@ class eynollah: id_indexer = 0 id_indexer_l = 0 if len(found_polygons_text_region) > 0: - self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) + xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion = ET.SubElement(page, 'TextRegion') textregion.set('id', 'r%s' % id_indexer) @@ -1237,7 +1217,7 @@ class eynollah: id_of_marginalia = [] if len(found_polygons_text_region) > 0: - self.xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) + xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') textregion.set('id', 'r%s' % id_indexer) @@ -1561,7 +1541,7 @@ class eynollah: indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - for zahler, mtv in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] tartib = np.where(indexes_sorted == arg_order_v)[0][0] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point @@ -1666,7 +1646,6 @@ class eynollah: except Exception as why: self.logger.error(why) num_col = None - peaks_neg_fin = [] return num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 def run_enhancement(self): @@ -1720,11 +1699,8 @@ class eynollah: image_page_rotated, textline_mask_tot = image_page[:, :], textline_mask_tot_ea[:, :] textline_mask_tot[mask_images[:, :] == 1] = 0 - pixel_img = 1 - min_area = 0.00001 - max_area = 0.0006 text_regions_p_1[mask_lines[:, :] == 1] = 3 - text_regions_p = 
text_regions_p_1[:, :] # long_short_region[:,:]#self.get_regions_from_2_models(image_page) + text_regions_p = text_regions_p_1[:, :] text_regions_p = np.array(text_regions_p) if num_col_classifier in (1, 2): @@ -2025,7 +2001,7 @@ class eynollah: scale_param = 1 all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) - all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, _ = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) K.clear_session() diff --git a/sbb_newspapers_org_image/utils/xml.py b/sbb_newspapers_org_image/utils/xml.py index 072bca5..bba7db8 100644 --- a/sbb_newspapers_org_image/utils/xml.py +++ b/sbb_newspapers_org_image/utils/xml.py @@ -36,3 +36,26 @@ def add_textequiv(parent, text=''): textequiv = ET.SubElement(parent, 'TextEquiv') unireg = 
ET.SubElement(textequiv, 'Unicode') unireg.text = text + +def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals): + """ + XXX side-effect: extends id_of_marginalia + """ + region_order = ET.SubElement(page, 'ReadingOrder') + region_order_sub = ET.SubElement(region_order, 'OrderedGroup') + region_order_sub.set('id', "ro357564684568544579089") + indexer_region = 0 + for vj in order_of_texts: + name = "coord_text_%s" % vj + name = ET.SubElement(region_order_sub, 'RegionRefIndexed') + name.set('index', str(indexer_region)) + name.set('regionRef', id_of_texts[vj]) + indexer_region += 1 + for vm in range(len(found_polygons_marginals)): + id_of_marginalia.append('r%s' % indexer_region) + name = "coord_text_%s" % indexer_region + name = ET.SubElement(region_order_sub, 'RegionRefIndexed') + name.set('index', str(indexer_region)) + name.set('regionRef', 'r%s' % indexer_region) + indexer_region += 1 + From 4c81fa2e46d67a07315bf0cc35f43616a44dfd50 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 16:05:30 +0100 Subject: [PATCH 81/89] simplify constructs, remove print-debugging stmts --- sbb_newspapers_org_image/eynollah.py | 52 +++++----------------------- 1 file changed, 9 insertions(+), 43 deletions(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index b4779da..258bedf 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -1392,11 +1392,11 @@ class eynollah: con_inter_box = [] con_inter_box_h = [] - for i in range(len(args_contours_box)): - con_inter_box.append(contours_only_text_parent[args_contours_box[i]]) + for box in args_contours_box: + con_inter_box.append(contours_only_text_parent[box]) - for i in range(len(args_contours_box_h)): - con_inter_box_h.append(contours_only_text_parent_h[args_contours_box_h[i]]) + for box in args_contours_box_h: + con_inter_box_h.append(contours_only_text_parent_h[box]) indexes_sorted, 
matrix_of_orders, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(textline_mask_tot[int(boxes[iij][2]) : int(boxes[iij][3]), int(boxes[iij][0]) : int(boxes[iij][1])], con_inter_box, con_inter_box_h, boxes[iij][2]) @@ -1431,8 +1431,7 @@ class eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] - order_text_new.append(tartib_new) + order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) except Exception as why: self.logger.error(why) @@ -1506,8 +1505,7 @@ class eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] - order_text_new.append(tartib_new) + order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) return order_text_new, id_of_texts_tot def do_order_of_regions_no_full_layout(self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): @@ -1557,8 +1555,7 @@ class eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] - order_text_new.append(tartib_new) + order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) except Exception as why: self.logger.error(why) @@ -1590,7 +1587,7 @@ class eynollah: indexes_sorted_main = np.array(indexes_sorted)[np.array(kind_of_texts_sorted) == 1] indexes_by_type_main = np.array(index_by_kind_sorted)[np.array(kind_of_texts_sorted) == 1] - for zahler, mtv in enumerate(args_contours_box): + for zahler, _ in enumerate(args_contours_box): arg_order_v = indexes_sorted_main[zahler] tartib = np.where(indexes_sorted == arg_order_v)[0][0] order_by_con_main[args_contours_box[indexes_by_type_main[zahler]]] = tartib + ref_point @@ -1606,8 +1603,7 @@ class eynollah: order_text_new = [] for iii in range(len(order_of_texts_tot)): - tartib_new = np.where(np.array(order_of_texts_tot) == iii)[0][0] 
- order_text_new.append(tartib_new) + order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) return order_text_new, id_of_texts_tot @@ -1675,11 +1671,7 @@ class eynollah: scaler_h_textline = 1 # 1.2#1.2 scaler_w_textline = 1 # 0.9#1 textline_mask_tot_ea, _ = self.textline_contours(image_page, True, scaler_h_textline, scaler_w_textline) - K.clear_session() - #print(np.unique(textline_mask_tot_ea[:, :]), "textline") - # plt.imshow(textline_mask_tot_ea) - # plt.show() if self.plotter: self.plotter.save_plot_of_textlines(textline_mask_tot_ea, image_page) return textline_mask_tot_ea @@ -1878,21 +1870,9 @@ class eynollah: # plt.imshow(img_revised_tab) # plt.show() - # print(img_revised_tab.shape,text_regions_p_1_n.shape) - # text_regions_p_1_n=resize_image(text_regions_p_1_n,img_revised_tab.shape[0],img_revised_tab.shape[1]) - # print(np.unique(text_regions_p_1_n),'uni') - text_only = ((img_revised_tab[:, :] == 1)) * 1 if np.abs(slope_deskew) >= SLOPE_THRESHOLD: text_only_d = ((text_regions_p_1_n[:, :] == 1)) * 1 - ##text_only_h=( (img_revised_tab[:,:,0]==2) )*1 - - # print(text_only.shape,text_only_d.shape) - # plt.imshow(text_only) - # plt.show() - - # plt.imshow(text_only_d) - # plt.show() min_con_area = 0.000005 if np.abs(slope_deskew) >= SLOPE_THRESHOLD: @@ -1943,26 +1923,12 @@ class eynollah: x_diff = p_big[0] - cx_bigest_d_big y_diff = p_big[1] - cy_biggest_d_big - # print(p_big) - # print(cx_bigest_d_big,cy_biggest_d_big) - # print(x_diff,y_diff) - contours_only_text_parent_d_ordered = [] for i in range(len(contours_only_text_parent)): - # img1=np.zeros((text_only.shape[0],text_only.shape[1],3)) - # img1=cv2.fillPoly(img1,pts=[contours_only_text_parent[i]] ,color=(1,1,1)) - # plt.imshow(img1[:,:,0]) - # plt.show() - p = np.dot(M_22, [cx_bigest[i], cy_biggest[i]]) - # print(p) p[0] = p[0] - x_diff[0] p[1] = p[1] - y_diff[0] - # print(p) - # print(cx_bigest_d) - # print(cy_biggest_d) dists = [math.sqrt((p[0] - cx_bigest_d[j]) ** 2 + (p[1] - 
cy_biggest_d[j]) ** 2) for j in range(len(cx_bigest_d))] - # print(np.argmin(dists)) contours_only_text_parent_d_ordered.append(contours_only_text_parent_d[np.argmin(dists)]) # img2=np.zeros((text_only.shape[0],text_only.shape[1],3)) # img2=cv2.fillPoly(img2,pts=[contours_only_text_parent_d[np.argmin(dists)]] ,color=(1,1,1)) From 045ab86fd5dff9186631add77fc893aa3a7efea7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 16:29:40 +0100 Subject: [PATCH 82/89] eynollah.run returns the pcgts now; --- sbb_newspapers_org_image/cli.py | 9 +++++---- sbb_newspapers_org_image/eynollah.py | 22 ++++++++++++---------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/sbb_newspapers_org_image/cli.py b/sbb_newspapers_org_image/cli.py index 10bf5e8..a302ff5 100644 --- a/sbb_newspapers_org_image/cli.py +++ b/sbb_newspapers_org_image/cli.py @@ -1,6 +1,6 @@ import click from ocrd_utils import initLogging, setOverrideLogLevel -from sbb_newspapers_org_image.eynollah import eynollah +from sbb_newspapers_org_image.eynollah import Eynollah @click.command() @@ -109,7 +109,7 @@ def main( if log_level: setOverrideLogLevel(log_level) initLogging() - eynollah( + eynollah = Eynollah( image, None, out, @@ -124,8 +124,9 @@ def main( full_layout, allow_scaling, headers_off, - ).run() - + ) + pcgts = eynollah.run() + eynollah.write_pagexml(pcgts) if __name__ == "__main__": main() diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 258bedf..e19bd1c 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -88,7 +88,7 @@ DPI_THRESHOLD = 298 MAX_SLOPE = 999 KERNEL = np.ones((5, 5), np.uint8) -class eynollah: +class Eynollah: def __init__( self, image_filename, @@ -1149,7 +1149,7 @@ class eynollah: coord.set('points',points_co) return id_indexer_l - def write_into_page_xml(self, pcgts): + def write_pagexml(self, pcgts): self.logger.info("filename stem: '%s'", self.image_filename_stem) 
tree = ET.ElementTree(pcgts) tree.write(os.path.join(self.dir_out, self.image_filename_stem) + ".xml") @@ -1826,9 +1826,9 @@ class eynollah: """ self.logger.debug("enter run") - t1 = time.time() + t0 = time.time() img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement() - self.logger.info("Enhancing took %ss ", str(time.time() - t1)) + self.logger.info("Enhancing took %ss ", str(time.time() - t0)) t1 = time.time() text_regions_p_1 = self.get_regions_from_xy_2models(img_res, is_image_enhanced) @@ -1841,9 +1841,9 @@ class eynollah: if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") - self.write_into_page_xml(self.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [])) + pcgts = self.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], []) self.logger.info("Job done in %ss", str(time.time() - t1)) - return + return pcgts t1 = time.time() textline_mask_tot_ea = self.run_textline(image_page) @@ -2040,8 +2040,9 @@ class eynollah: else: order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - self.write_into_page_xml(self.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes)) - + pcgts = self.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, 
all_found_texline_polygons_marginals, all_box_coord_marginals, slopes) + self.logger.info("Job done in %ss", str(time.time() - t0)) + return pcgts else: contours_only_text_parent_h = None if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -2049,6 +2050,7 @@ class eynollah: else: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - self.write_into_page_xml(self.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes)) + pcgts = self.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes) + self.logger.info("Job done in %ss", str(time.time() - t0)) + return pcgts - self.logger.info("Job done in %ss", str(time.time() - t1)) From 22184024fad35e3e2019152da19d03d93a63861a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 16:36:03 +0100 Subject: [PATCH 83/89] cli: fail early with inconsistent plotting options --- sbb_newspapers_org_image/cli.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sbb_newspapers_org_image/cli.py b/sbb_newspapers_org_image/cli.py index a302ff5..039bb23 100644 --- a/sbb_newspapers_org_image/cli.py +++ b/sbb_newspapers_org_image/cli.py @@ -1,3 +1,4 @@ +import sys import click from ocrd_utils import initLogging, setOverrideLogLevel from sbb_newspapers_org_image.eynollah import Eynollah @@ -109,6 +110,12 @@ def main( if log_level: setOverrideLogLevel(log_level) initLogging() + if not enable_plotting and (save_layout or save_deskewed or 
save_all or save_images): + print("Error: You used one of -sl, -sd, -sa or -si but did not enable plotting with -ep") + sys.exit(1) + elif enable_plotting and not (save_layout or save_deskewed or save_all or save_images): + print("Error: You used -ep to enable plotting but set none of -sl, -sd, -sa or -si") + sys.exit(1) eynollah = Eynollah( image, None, From c730e2eefd186f3ee39bff35390330807696d466 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 17:14:34 +0100 Subject: [PATCH 84/89] move all XML handling to EynollahXmlWriter --- sbb_newspapers_org_image/cli.py | 2 +- sbb_newspapers_org_image/eynollah.py | 308 ++++----------------------- sbb_newspapers_org_image/writer.py | 261 +++++++++++++++++++++++ 3 files changed, 300 insertions(+), 271 deletions(-) create mode 100644 sbb_newspapers_org_image/writer.py diff --git a/sbb_newspapers_org_image/cli.py b/sbb_newspapers_org_image/cli.py index 039bb23..c18555d 100644 --- a/sbb_newspapers_org_image/cli.py +++ b/sbb_newspapers_org_image/cli.py @@ -133,7 +133,7 @@ def main( headers_off, ) pcgts = eynollah.run() - eynollah.write_pagexml(pcgts) + eynollah.writer.write_pagexml(pcgts) if __name__ == "__main__": main() diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index e19bd1c..21747b4 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -14,7 +14,6 @@ import warnings from pathlib import Path from multiprocessing import Process, Queue, cpu_count -from lxml import etree as ET from ocrd_utils import getLogger import cv2 import numpy as np @@ -41,28 +40,19 @@ from .utils.contour import ( return_contours_of_interested_textline, return_parent_contours, ) - from .utils.rotate import ( rotate_image, rotation_not_90_func, - rotation_not_90_func_full_layout -) - + rotation_not_90_func_full_layout) from .utils.separate_lines import ( textline_contours_postprocessing, seperate_lines_new2, - return_deskew_slop, -) - + 
return_deskew_slop) from .utils.drop_capitals import ( adhere_drop_capital_region_into_cprresponding_textline, - filter_small_drop_capitals_from_no_patch_layout -) - + filter_small_drop_capitals_from_no_patch_layout) from .utils.marginals import get_marginals - from .utils.resize import resize_image - from .utils import ( boosting_headers_by_longshot_region_segmentation, crop_image_inside_box, @@ -75,12 +65,10 @@ from .utils import ( order_and_id_of_texts, order_of_regions, find_number_of_columns_in_document, - return_boxes_of_images_by_order_of_reading_new, -) - -from .utils.xml import create_page_xml, add_textequiv, xml_reading_order + return_boxes_of_images_by_order_of_reading_new) from .utils.pil_cv2 import check_dpi from .plot import EynollahPlotter +from .writer import EynollahXmlWriter SLOPE_THRESHOLD = 0.13 RATIO_OF_TWO_MODEL_THRESHOLD = 95.50 #98.45: @@ -107,7 +95,6 @@ class Eynollah: headers_off=False ): self.image_filename = image_filename - self.cont_page = [] self.dir_out = dir_out self.image_filename_stem = image_filename_stem self.allow_enhancement = allow_enhancement @@ -123,8 +110,11 @@ class Eynollah: dir_of_cropped_images=dir_of_cropped_images, dir_of_layout=dir_of_layout, image_filename=image_filename, - image_filename_stem=image_filename_stem, - ) + image_filename_stem=image_filename_stem) + self.writer = EynollahXmlWriter( + dir_out=self.dir_out, + image_filename=self.image_filename, + curved_line=self.curved_line) self.logger = getLogger('eynollah') self.dir_models = dir_models @@ -401,12 +391,16 @@ class Eynollah: self.image = resize_image(self.image, self.img_hight_int, self.img_width_int) # Also set for the plotter - # XXX TODO hacky if self.plotter: self.plotter.image_org = self.image_org self.plotter.scale_y = self.scale_y self.plotter.scale_x = self.scale_x - + # Also set for the writer + self.writer.image_org = self.image_org + self.writer.scale_y = self.scale_y + self.writer.scale_x = self.scale_x + self.writer.height_org = 
self.height_org + self.writer.width_org = self.width_org def get_image_and_scales_after_enhancing(self, img_org, img_res): self.logger.debug("enter get_image_and_scales_after_enhancing") @@ -419,6 +413,18 @@ class Eynollah: self.scale_y = img_res.shape[0] / float(self.image_org.shape[0]) self.scale_x = img_res.shape[1] / float(self.image_org.shape[1]) + # Also set for the plotter + if self.plotter: + self.plotter.image_org = self.image_org + self.plotter.scale_y = self.scale_y + self.plotter.scale_x = self.scale_x + # Also set for the writer + self.writer.image_org = self.image_org + self.writer.scale_y = self.scale_y + self.writer.scale_x = self.scale_x + self.writer.height_org = self.height_org + self.writer.width_org = self.width_org + def start_new_session_and_model(self, model_dir): self.logger.debug("enter start_new_session_and_model (model_dir=%s)", model_dir) config = tf.ConfigProto() @@ -570,6 +576,7 @@ class Eynollah: def extract_page(self): self.logger.debug("enter extract_page") + cont_page = [] model_page, session_page = self.start_new_session_and_model(self.model_page_dir) img = cv2.GaussianBlur(self.image, (5, 5), 0) img_page_prediction = self.do_prediction(False, img, model_page) @@ -593,12 +600,12 @@ class Eynollah: box = [x, y, w, h] croped_page, page_coord = crop_image_inside_box(box, self.image) - self.cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) + cont_page.append(np.array([[page_coord[2], page_coord[0]], [page_coord[3], page_coord[0]], [page_coord[3], page_coord[1]], [page_coord[2], page_coord[1]]])) session_page.close() K.clear_session() self.logger.debug("exit extract_page") - return croped_page, page_coord + return croped_page, page_coord, cont_page def extract_text_regions(self, img, patches, cols): self.logger.debug("enter extract_text_regions") @@ -1038,245 +1045,6 @@ class Eynollah: poly.put(poly_sub) 
box_sub.put(boxes_sub_new) - def calculate_polygon_coords(self, contour_list, i, page_coord): - self.logger.debug('enter calculate_polygon_coords') - coords = '' - for j in range(len(contour_list[i])): - if len(contour_list[i][j]) == 2: - coords += str(int((contour_list[i][j][0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((contour_list[i][j][1] + page_coord[0]) / self.scale_y)) - else: - coords += str(int((contour_list[i][j][0][0] + page_coord[2]) / self.scale_x)) - coords += ',' - coords += str(int((contour_list[i][j][0][1] + page_coord[0]) / self.scale_y)) - - if j < len(contour_list[i]) - 1: - coords=coords + ' ' - #print(coords) - return coords - - def calculate_page_coords(self): - self.logger.debug('enter calculate_page_coords') - points_page_print = "" - for _, contour in enumerate(self.cont_page[0]): - if len(contour) == 2: - points_page_print += str(int((contour[0]) / self.scale_x)) - points_page_print += ',' - points_page_print += str(int((contour[1]) / self.scale_y)) - else: - points_page_print += str(int((contour[0][0]) / self.scale_x)) - points_page_print += ',' - points_page_print += str(int((contour[0][1] ) / self.scale_y)) - points_page_print = points_page_print + ' ' - return points_page_print[:-1] - - def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l): - for j in range(len(all_found_texline_polygons_marginals[marginal_idx])): - textline = ET.SubElement(marginal, 'TextLine') - textline.set('id', 'l%s' % id_indexer_l) - id_indexer_l += 1 - coord = ET.SubElement(textline, 'Coords') - add_textequiv(textline) - points_co = '' - for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])): - if not self.curved_line: - if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + 
page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0])/self.scale_y)) - else: - if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y)) - if l < len(all_found_texline_polygons_marginals[marginal_idx][j]) - 1: - points_co += ' ' - coord.set('points',points_co) - return id_indexer_l - - def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l): - self.logger.debug('enter serialize_lines_in_region') - for j in range(len(all_found_texline_polygons[region_idx])): - textline = ET.SubElement(textregion, 'TextLine') - textline.set('id', 'l%s' % id_indexer_l) - id_indexer_l += 1 - coord = ET.SubElement(textline, 'Coords') - add_textequiv(textline) - - points_co = '' - for l in range(len(all_found_texline_polygons[region_idx][j])): - if not self.curved_line: - if len(all_found_texline_polygons[region_idx][j][l])==2: - textline_x_coord = max(0, 
int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) - else: - textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) - textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) - points_co += str(textline_x_coord) - points_co += ',' - points_co += str(textline_y_coord) - - if self.curved_line and np.abs(slopes[region_idx]) <= 45: - if len(all_found_texline_polygons[region_idx][j][l]) == 2: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + page_coord[0]) / self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + page_coord[0])/self.scale_y)) - elif self.curved_line and np.abs(slopes[region_idx]) > 45: - if len(all_found_texline_polygons[region_idx][j][l])==2: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) - else: - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) - points_co += ',' - points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) - - if l < 
len(all_found_texline_polygons[region_idx][j]) - 1: - points_co += ' ' - coord.set('points',points_co) - return id_indexer_l - - def write_pagexml(self, pcgts): - self.logger.info("filename stem: '%s'", self.image_filename_stem) - tree = ET.ElementTree(pcgts) - tree.write(os.path.join(self.dir_out, self.image_filename_stem) + ".xml") - - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes): - self.logger.debug('enter build_pagexml_no_full_layout') - - # create the file structure - pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) - page_print_sub = ET.SubElement(page, "Border") - coord_page = ET.SubElement(page_print_sub, "Coords") - coord_page.set('points', self.calculate_page_coords()) - - id_of_marginalia = [] - id_indexer = 0 - id_indexer_l = 0 - if len(found_polygons_text_region) > 0: - xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) - for mm in range(len(found_polygons_text_region)): - textregion = ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 - textregion.set('type', 'paragraph') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) - add_textequiv(textregion) - - for marginal_idx in range(len(found_polygons_marginals)): - marginal = ET.SubElement(page, 'TextRegion') - marginal.set('id', id_of_marginalia[mm]) - marginal.set('type', 'marginalia') - coord_text = ET.SubElement(marginal, 'Coords') - coord_text.set('points', 
self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) - self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l) - - id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) - for mm in range(len(found_polygons_text_region_img)): - textregion = ET.SubElement(page, 'ImageRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 - coord_text = ET.SubElement(textregion, 'Coords') - points_co = '' - for lmm in range(len(found_polygons_text_region_img[mm])): - points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) - points_co += ',' - points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) - if lmm < len(found_polygons_text_region_img[mm]) - 1: - points_co += ' ' - coord_text.set('points', points_co) - - return pcgts - - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes): - self.logger.debug('enter build_pagexml_full_layout') - - # create the file structure - pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) - page_print_sub = ET.SubElement(page, "Border") - coord_page = ET.SubElement(page_print_sub, "Coords") - coord_page.set('points', self.calculate_page_coords()) - - id_indexer = 0 - id_indexer_l = 0 - id_of_marginalia = [] - - if len(found_polygons_text_region) > 0: - xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) - for mm in range(len(found_polygons_text_region)): - textregion=ET.SubElement(page, 
'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 - textregion.set('type', 'paragraph') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) - add_textequiv(textregion) - - self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) - if len(found_polygons_text_region_h) > 0: - for mm in range(len(found_polygons_text_region_h)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 - textregion.set('type','header') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) - id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) - add_textequiv(textregion) - - if len(found_polygons_drop_capitals) > 0: - id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) - for mm in range(len(found_polygons_drop_capitals)): - textregion=ET.SubElement(page, 'TextRegion') - textregion.set('id',' r%s' % id_indexer) - id_indexer += 1 - textregion.set('type', 'drop-capital') - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) - add_textequiv(textregion) - - for marginal_idx in range(len(found_polygons_marginals)): - marginal = ET.SubElement(page, 'TextRegion') - add_textequiv(textregion) - marginal.set('id', id_of_marginalia[mm]) - marginal.set('type', 'marginalia') - coord_text = ET.SubElement(marginal, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, 
page_coord)) - self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l) - - id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals) - for mm in range(len(found_polygons_text_region_img)): - textregion=ET.SubElement(page, 'ImageRegion') - textregion.set('id', 'r%s' % id_indexer) - id_indexer += 1 - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) - - for mm in range(len(found_polygons_tables)): - textregion = ET.SubElement(page, 'TableRegion') - textregion.set('id', 'r%s' %id_indexer) - id_indexer += 1 - coord_text = ET.SubElement(textregion, 'Coords') - coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) - - return pcgts - def get_regions_from_xy_2models(self,img,is_image_enhanced): self.logger.debug("enter get_regions_from_xy_2models") img_org = np.copy(img) @@ -1621,7 +1389,7 @@ class Eynollah: img_g3[:, :, 1] = img_g[:, :] img_g3[:, :, 2] = img_g[:, :] - image_page, page_coord = self.extract_page() + image_page, page_coord, cont_page = self.extract_page() if self.plotter: self.plotter.save_page_image(image_page) @@ -1642,7 +1410,7 @@ class Eynollah: except Exception as why: self.logger.error(why) num_col = None - return num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1 + return num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page def run_enhancement(self): self.logger.info("resize and enhance image") @@ -1835,13 +1603,14 @@ class Eynollah: self.logger.info("Textregion detection took %ss ", str(time.time() - t1)) t1 = time.time() - num_col, num_col_classifier, img_only_regions, page_coord, image_page, 
mask_images, mask_lines, text_regions_p_1 = \ + num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page = \ self.run_graphics_and_columns(text_regions_p_1, num_col_classifier, num_column_is_classified) self.logger.info("Graphics detection took %ss ", str(time.time() - t1)) + self.logger.info('cont_page %s', cont_page) if not num_col: self.logger.info("No columns detected, outputting an empty PAGE-XML") - pcgts = self.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], []) + pcgts = self.writer.build_pagexml_no_full_layout([], page_coord, [], [], [], [], [], [], [], [], [], cont_page) self.logger.info("Job done in %ss", str(time.time() - t1)) return pcgts @@ -2040,7 +1809,7 @@ class Eynollah: else: order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - pcgts = self.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes) + pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page) self.logger.info("Job done in %ss", str(time.time() - t0)) return pcgts else: @@ -2050,7 +1819,6 @@ class Eynollah: else: contours_only_text_parent_d_ordered = 
list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - pcgts = self.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes) + pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page) self.logger.info("Job done in %ss", str(time.time() - t0)) return pcgts - diff --git a/sbb_newspapers_org_image/writer.py b/sbb_newspapers_org_image/writer.py new file mode 100644 index 0000000..a949322 --- /dev/null +++ b/sbb_newspapers_org_image/writer.py @@ -0,0 +1,261 @@ +# pylint: disable=too-many-locals,wrong-import-position,too-many-lines,too-many-statements,chained-comparison,fixme,broad-except,c-extension-no-member +from pathlib import Path +import os.path + +from .utils.xml import create_page_xml, add_textequiv, xml_reading_order + +from ocrd_utils import getLogger +from lxml import etree as ET +import numpy as np + +class EynollahXmlWriter(): + + def __init__(self, *, dir_out, image_filename, curved_line): + self.logger = getLogger('eynollah.writer') + self.dir_out = dir_out + self.image_filename = image_filename + self.image_filename_stem = Path(Path(image_filename).name).stem + self.curved_line = curved_line + self.scale_x = None # XXX set outside __init__ + self.scale_y = None # XXX set outside __init__ + self.height_org = None # XXX set outside __init__ + self.width_org = None # XXX set outside __init__ + + def calculate_page_coords(self, cont_page): + self.logger.debug('enter 
calculate_page_coords') + points_page_print = "" + for _, contour in enumerate(cont_page[0]): + if len(contour) == 2: + points_page_print += str(int((contour[0]) / self.scale_x)) + points_page_print += ',' + points_page_print += str(int((contour[1]) / self.scale_y)) + else: + points_page_print += str(int((contour[0][0]) / self.scale_x)) + points_page_print += ',' + points_page_print += str(int((contour[0][1] ) / self.scale_y)) + points_page_print = points_page_print + ' ' + return points_page_print[:-1] + + def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l): + for j in range(len(all_found_texline_polygons_marginals[marginal_idx])): + textline = ET.SubElement(marginal, 'TextLine') + textline.set('id', 'l%s' % id_indexer_l) + id_indexer_l += 1 + coord = ET.SubElement(textline, 'Coords') + add_textequiv(textline) + points_co = '' + for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])): + if not self.curved_line: + if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) + else: + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0])/self.scale_y)) + else: + if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / 
self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y)) + else: + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y)) + if l < len(all_found_texline_polygons_marginals[marginal_idx][j]) - 1: + points_co += ' ' + coord.set('points',points_co) + return id_indexer_l + + def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l): + self.logger.debug('enter serialize_lines_in_region') + for j in range(len(all_found_texline_polygons[region_idx])): + textline = ET.SubElement(textregion, 'TextLine') + textline.set('id', 'l%s' % id_indexer_l) + id_indexer_l += 1 + coord = ET.SubElement(textline, 'Coords') + add_textequiv(textline) + + points_co = '' + for l in range(len(all_found_texline_polygons[region_idx][j])): + if not self.curved_line: + if len(all_found_texline_polygons[region_idx][j][l])==2: + textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) + else: + textline_x_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2] + page_coord[2]) / self.scale_x)) + textline_y_coord = max(0, int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0] + page_coord[0]) / self.scale_y)) + points_co += str(textline_x_coord) + points_co += ',' + points_co += str(textline_y_coord) + + if self.curved_line and np.abs(slopes[region_idx]) <= 45: + if len(all_found_texline_polygons[region_idx][j][l]) == 2: + 
points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + page_coord[0]) / self.scale_y)) + else: + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + page_coord[0])/self.scale_y)) + elif self.curved_line and np.abs(slopes[region_idx]) > 45: + if len(all_found_texline_polygons[region_idx][j][l])==2: + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) + else: + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][0] + all_box_coord[region_idx][2]+page_coord[2])/self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons[region_idx][j][l][0][1] + all_box_coord[region_idx][0]+page_coord[0])/self.scale_y)) + + if l < len(all_found_texline_polygons[region_idx][j]) - 1: + points_co += ' ' + coord.set('points',points_co) + return id_indexer_l + + def write_pagexml(self, pcgts): + self.logger.info("filename stem: '%s'", self.image_filename_stem) + tree = ET.ElementTree(pcgts) + tree.write(os.path.join(self.dir_out, self.image_filename_stem) + ".xml") + + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page): + self.logger.debug('enter build_pagexml_no_full_layout') + + # create the file structure + pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) + 
page_print_sub = ET.SubElement(page, "Border") + coord_page = ET.SubElement(page_print_sub, "Coords") + coord_page.set('points', self.calculate_page_coords(cont_page)) + + id_of_marginalia = [] + id_indexer = 0 + id_indexer_l = 0 + if len(found_polygons_text_region) > 0: + xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) + for mm in range(len(found_polygons_text_region)): + textregion = ET.SubElement(page, 'TextRegion') + textregion.set('id', 'r%s' % id_indexer) + id_indexer += 1 + textregion.set('type', 'paragraph') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) + add_textequiv(textregion) + + for marginal_idx in range(len(found_polygons_marginals)): + marginal = ET.SubElement(page, 'TextRegion') + marginal.set('id', id_of_marginalia[mm]) + marginal.set('type', 'marginalia') + coord_text = ET.SubElement(marginal, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) + self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l) + + id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) + for mm in range(len(found_polygons_text_region_img)): + textregion = ET.SubElement(page, 'ImageRegion') + textregion.set('id', 'r%s' % id_indexer) + id_indexer += 1 + coord_text = ET.SubElement(textregion, 'Coords') + points_co = '' + for lmm in range(len(found_polygons_text_region_img[mm])): + points_co += str(int((found_polygons_text_region_img[mm][lmm,0,0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((found_polygons_text_region_img[mm][lmm,0,1] + page_coord[0]) / self.scale_y)) + if lmm < 
len(found_polygons_text_region_img[mm]) - 1: + points_co += ' ' + coord_text.set('points', points_co) + + return pcgts + + def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page): + self.logger.debug('enter build_pagexml_full_layout') + + # create the file structure + pcgts, page = create_page_xml(self.image_filename, self.height_org, self.width_org) + page_print_sub = ET.SubElement(page, "Border") + coord_page = ET.SubElement(page_print_sub, "Coords") + coord_page.set('points', self.calculate_page_coords(cont_page)) + + id_indexer = 0 + id_indexer_l = 0 + id_of_marginalia = [] + + if len(found_polygons_text_region) > 0: + xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) + for mm in range(len(found_polygons_text_region)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id', 'r%s' % id_indexer) + id_indexer += 1 + textregion.set('type', 'paragraph') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region, mm, page_coord)) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) + add_textequiv(textregion) + + self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) + if len(found_polygons_text_region_h) > 0: + for mm in range(len(found_polygons_text_region_h)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id', 'r%s' % id_indexer) + id_indexer += 1 + textregion.set('type','header') + coord_text = ET.SubElement(textregion, 'Coords') + 
coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_h, mm, page_coord)) + id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes, id_indexer_l) + add_textequiv(textregion) + + if len(found_polygons_drop_capitals) > 0: + id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + for mm in range(len(found_polygons_drop_capitals)): + textregion=ET.SubElement(page, 'TextRegion') + textregion.set('id',' r%s' % id_indexer) + id_indexer += 1 + textregion.set('type', 'drop-capital') + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) + add_textequiv(textregion) + + for marginal_idx in range(len(found_polygons_marginals)): + marginal = ET.SubElement(page, 'TextRegion') + add_textequiv(textregion) + marginal.set('id', id_of_marginalia[mm]) + marginal.set('type', 'marginalia') + coord_text = ET.SubElement(marginal, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) + self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l) + + id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals) + for mm in range(len(found_polygons_text_region_img)): + textregion=ET.SubElement(page, 'ImageRegion') + textregion.set('id', 'r%s' % id_indexer) + id_indexer += 1 + coord_text = ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_text_region_img, mm, page_coord)) + + for mm in range(len(found_polygons_tables)): + textregion = ET.SubElement(page, 'TableRegion') + textregion.set('id', 'r%s' %id_indexer) + id_indexer += 1 + coord_text = 
ET.SubElement(textregion, 'Coords') + coord_text.set('points', self.calculate_polygon_coords(found_polygons_tables, mm, page_coord)) + + return pcgts + + def calculate_polygon_coords(self, contour_list, i, page_coord): + self.logger.debug('enter calculate_polygon_coords') + coords = '' + for j in range(len(contour_list[i])): + if len(contour_list[i][j]) == 2: + coords += str(int((contour_list[i][j][0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((contour_list[i][j][1] + page_coord[0]) / self.scale_y)) + else: + coords += str(int((contour_list[i][j][0][0] + page_coord[2]) / self.scale_x)) + coords += ',' + coords += str(int((contour_list[i][j][0][1] + page_coord[0]) / self.scale_y)) + + if j < len(contour_list[i]) - 1: + coords=coords + ' ' + return coords + From 3d77b62f898858f3074f837f253e3d895e23b16f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 18:13:17 +0100 Subject: [PATCH 85/89] remove unused --- sbb_newspapers_org_image/unused.py | 3367 ---------------------------- 1 file changed, 3367 deletions(-) delete mode 100644 sbb_newspapers_org_image/unused.py diff --git a/sbb_newspapers_org_image/unused.py b/sbb_newspapers_org_image/unused.py deleted file mode 100644 index e3514b8..0000000 --- a/sbb_newspapers_org_image/unused.py +++ /dev/null @@ -1,3367 +0,0 @@ -""" -Unused methods from eynollah -""" - -import numpy as np -from shapely import geometry -import cv2 - -def color_images_diva(seg, n_classes): - """ - XXX unused - """ - ann_u = range(n_classes) - if len(np.shape(seg)) == 3: - seg = seg[:, :, 0] - - seg_img = np.zeros((np.shape(seg)[0], np.shape(seg)[1], 3)).astype(float) - # colors=sns.color_palette("hls", n_classes) - colors = [[1, 0, 0], [8, 0, 0], [2, 0, 0], [4, 0, 0]] - - for c in ann_u: - c = int(c) - segl = seg == c - seg_img[:, :, 0][seg == c] = colors[c][0] # segl*(colors[c][0]) - seg_img[:, :, 1][seg == c] = colors[c][1] # seg_img[:,:,1]=segl*(colors[c][1]) - seg_img[:, :, 2][seg == c] = 
colors[c][2] # seg_img[:,:,2]=segl*(colors[c][2]) - return seg_img - -def find_polygons_size_filter(contours, median_area, scaler_up=1.2, scaler_down=0.8): - """ - XXX unused - """ - found_polygons_early = list() - - for c in contours: - if len(c) < 3: # A polygon cannot have less than 3 points - continue - - polygon = geometry.Polygon([point[0] for point in c]) - area = polygon.area - # Check that polygon has area greater than minimal area - if area >= median_area * scaler_down and area <= median_area * scaler_up: - found_polygons_early.append(np.array([point for point in polygon.exterior.coords], dtype=np.uint)) - return found_polygons_early - -def resize_ann(seg_in, input_height, input_width): - """ - XXX unused - """ - return cv2.resize(seg_in, (input_width, input_height), interpolation=cv2.INTER_NEAREST) - -def get_one_hot(seg, input_height, input_width, n_classes): - seg = seg[:, :, 0] - seg_f = np.zeros((input_height, input_width, n_classes)) - for j in range(n_classes): - seg_f[:, :, j] = (seg == j).astype(int) - return seg_f - -def color_images(seg, n_classes): - ann_u = range(n_classes) - if len(np.shape(seg)) == 3: - seg = seg[:, :, 0] - - seg_img = np.zeros((np.shape(seg)[0], np.shape(seg)[1], 3)).astype(np.uint8) - colors = sns.color_palette("hls", n_classes) - - for c in ann_u: - c = int(c) - segl = seg == c - seg_img[:, :, 0] = segl * c - seg_img[:, :, 1] = segl * c - seg_img[:, :, 2] = segl * c - return seg_img - -def cleaning_probs(probs, sigma): - # Smooth - if sigma > 0.0: - return cv2.GaussianBlur(probs, (int(3 * sigma) * 2 + 1, int(3 * sigma) * 2 + 1), sigma) - elif sigma == 0.0: - return cv2.fastNlMeansDenoising((probs * 255).astype(np.uint8), h=20) / 255 - else: # Negative sigma, do not do anything - return probs - - -def early_deskewing_slope_calculation_based_on_lines(region_pre_p): - # lines are labels by 6 in this model - seperators_closeup = ((region_pre_p[:, :, :] == 6)) * 1 - - seperators_closeup = seperators_closeup.astype(np.uint8) - 
imgray = cv2.cvtColor(seperators_closeup, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_lines, hierachy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = find_features_of_lines(contours_lines) - - slope_lines_org_hor = slope_lines_org[slope_lines == 0] - args = np.array(range(len(slope_lines))) - len_x = seperators_closeup.shape[1] / 4.0 - - args_hor = args[slope_lines == 0] - dist_x_hor = dist_x[slope_lines == 0] - x_min_main_hor = x_min_main[slope_lines == 0] - x_max_main_hor = x_max_main[slope_lines == 0] - cy_main_hor = cy_main[slope_lines == 0] - - args_hor = args_hor[dist_x_hor >= len_x / 2.0] - x_max_main_hor = x_max_main_hor[dist_x_hor >= len_x / 2.0] - x_min_main_hor = x_min_main_hor[dist_x_hor >= len_x / 2.0] - cy_main_hor = cy_main_hor[dist_x_hor >= len_x / 2.0] - slope_lines_org_hor = slope_lines_org_hor[dist_x_hor >= len_x / 2.0] - - slope_lines_org_hor = slope_lines_org_hor[np.abs(slope_lines_org_hor) < 1.2] - slope_mean_hor = np.mean(slope_lines_org_hor) - - if np.abs(slope_mean_hor) > 1.2: - slope_mean_hor = 0 - - # deskewed_new=rotate_image(image_regions_eraly_p[:,:,:],slope_mean_hor) - - args_ver = args[slope_lines == 1] - y_min_main_ver = y_min_main[slope_lines == 1] - y_max_main_ver = y_max_main[slope_lines == 1] - x_min_main_ver = x_min_main[slope_lines == 1] - x_max_main_ver = x_max_main[slope_lines == 1] - cx_main_ver = cx_main[slope_lines == 1] - dist_y_ver = y_max_main_ver - y_min_main_ver - len_y = seperators_closeup.shape[0] / 3.0 - - return slope_mean_hor, cx_main_ver, dist_y_ver - -def boosting_text_only_regions_by_header(textregion_pre_np, img_only_text): - result = ((img_only_text[:, :] == 1) | (textregion_pre_np[:, :, 0] == 2)) * 1 - return result - -def return_rotated_contours(slope, img_patch): - dst = rotate_image(img_patch, slope) - dst = dst.astype(np.uint8) - dst = 
dst[:, :, 0] - dst[dst != 0] = 1 - - imgray = cv2.cvtColor(dst, cv2.COLOR_BGR2GRAY) - _, thresh = cv2.threshold(imgray, 0, 255, 0) - thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel) - thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel) - contours, _ = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - return contours - -def get_textlines_for_each_textregions(self, textline_mask_tot, boxes): - textline_mask_tot = cv2.erode(textline_mask_tot, self.kernel, iterations=1) - self.area_of_cropped = [] - self.all_text_region_raw = [] - for jk in range(len(boxes)): - crop_img, crop_coor = crop_image_inside_box(boxes[jk], np.repeat(textline_mask_tot[:, :, np.newaxis], 3, axis=2)) - crop_img = crop_img.astype(np.uint8) - self.all_text_region_raw.append(crop_img[:, :, 0]) - self.area_of_cropped.append(crop_img.shape[0] * crop_img.shape[1]) - -def deskew_region_prediction(regions_prediction, slope): - image_regions_deskewd = np.zeros(regions_prediction[:, :].shape) - for ind in np.unique(regions_prediction[:, :]): - interest_reg = (regions_prediction[:, :] == ind) * 1 - interest_reg = interest_reg.astype(np.uint8) - deskewed_new = rotate_image(interest_reg, slope) - deskewed_new = deskewed_new[:, :] - deskewed_new[deskewed_new != 0] = ind - - image_regions_deskewd = image_regions_deskewd + deskewed_new - return image_regions_deskewd - -def deskew_erarly(textline_mask): - textline_mask_org = np.copy(textline_mask) - # print(textline_mask.shape,np.unique(textline_mask),'hizzzzz') - # slope_new=0#deskew_images(img_patch) - - textline_mask = np.repeat(textline_mask[:, :, np.newaxis], 3, axis=2) * 255 - - textline_mask = textline_mask.astype(np.uint8) - kernel = np.ones((5, 5), np.uint8) - - imgray = cv2.cvtColor(textline_mask, cv2.COLOR_BGR2GRAY) - - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours, hirarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - # print(hirarchy) - - commenst_contours = 
filter_contours_area_of_image(thresh, contours, hirarchy, max_area=0.01, min_area=0.003) - main_contours = filter_contours_area_of_image(thresh, contours, hirarchy, max_area=1, min_area=0.003) - interior_contours = filter_contours_area_of_image_interiors(thresh, contours, hirarchy, max_area=1, min_area=0) - - img_comm = np.zeros(thresh.shape) - img_comm_in = cv2.fillPoly(img_comm, pts=main_contours, color=(255, 255, 255)) - ###img_comm_in=cv2.fillPoly(img_comm, pts =interior_contours, color=(0,0,0)) - - img_comm_in = np.repeat(img_comm_in[:, :, np.newaxis], 3, axis=2) - img_comm_in = img_comm_in.astype(np.uint8) - - imgray = cv2.cvtColor(img_comm_in, cv2.COLOR_BGR2GRAY) - ##imgray = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) - - ##mask = cv2.inRange(imgray, lower_blue, upper_blue) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - # print(np.unique(mask)) - ##ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - ##plt.imshow(thresh) - ##plt.show() - - contours, hirarchy = cv2.findContours(thresh.copy(), cv2.cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - areas = [cv2.contourArea(contours[jj]) for jj in range(len(contours))] - - median_area = np.mean(areas) - contours_slope = contours # self.find_polugons_size_filter(contours,median_area=median_area,scaler_up=100,scaler_down=0.5) - - if len(contours_slope) > 0: - for jv in range(len(contours_slope)): - new_poly = list(contours_slope[jv]) - if jv == 0: - merged_all = new_poly - else: - merged_all = merged_all + new_poly - - merge = np.array(merged_all) - - img_in = np.zeros(textline_mask.shape) - img_p_in = cv2.fillPoly(img_in, pts=[merge], color=(255, 255, 255)) - - ##plt.imshow(img_p_in) - ##plt.show() - - rect = cv2.minAreaRect(merge) - - box = cv2.boxPoints(rect) - - box = np.int0(box) - - indexes = [0, 1, 2, 3] - x_list = box[:, 0] - y_list = box[:, 1] - - index_y_sort = np.argsort(y_list) - - index_upper_left = index_y_sort[np.argmin(x_list[index_y_sort[0:2]])] - index_upper_right = 
index_y_sort[np.argmax(x_list[index_y_sort[0:2]])] - - index_lower_left = index_y_sort[np.argmin(x_list[index_y_sort[2:]]) + 2] - index_lower_right = index_y_sort[np.argmax(x_list[index_y_sort[2:]]) + 2] - - alpha1 = float(box[index_upper_right][1] - box[index_upper_left][1]) / (float(box[index_upper_right][0] - box[index_upper_left][0])) - alpha2 = float(box[index_lower_right][1] - box[index_lower_left][1]) / (float(box[index_lower_right][0] - box[index_lower_left][0])) - - slope_true = (alpha1 + alpha2) / 2.0 - - # slope=0#slope_true/np.pi*180 - - # if abs(slope)>=1: - # slope=0 - - # dst=rotate_image(textline_mask,slope_true) - # dst=dst[:,:,0] - # dst[dst!=0]=1 - image_regions_deskewd = np.zeros(textline_mask_org[:, :].shape) - for ind in np.unique(textline_mask_org[:, :]): - interest_reg = (textline_mask_org[:, :] == ind) * 1 - interest_reg = interest_reg.astype(np.uint8) - deskewed_new = rotate_image(interest_reg, slope_true) - deskewed_new = deskewed_new[:, :] - deskewed_new[deskewed_new != 0] = ind - - image_regions_deskewd = image_regions_deskewd + deskewed_new - return image_regions_deskewd, slope_true - -def get_all_image_patches_coordination(self, image_page): - self.all_box_coord = [] - for jk in range(len(self.boxes)): - _, crop_coor = crop_image_inside_box(self.boxes[jk], image_page) - self.all_box_coord.append(crop_coor) - -def find_num_col_olddd(self, regions_without_seperators, sigma_, multiplier=3.8): - regions_without_seperators_0 = regions_without_seperators[:, :].sum(axis=1) - - meda_n_updown = regions_without_seperators_0[len(regions_without_seperators_0) :: -1] - - first_nonzero = next((i for i, x in enumerate(regions_without_seperators_0) if x), 0) - last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) - - last_nonzero = len(regions_without_seperators_0) - last_nonzero - - y = regions_without_seperators_0 # [first_nonzero:last_nonzero] - - y_help = np.zeros(len(y) + 20) - - y_help[10 : len(y) + 10] = y - - x = 
np.array(range(len(y))) - - zneg_rev = -y_help + np.max(y_help) - - zneg = np.zeros(len(zneg_rev) + 20) - - zneg[10 : len(zneg_rev) + 10] = zneg_rev - - z = gaussian_filter1d(y, sigma_) - zneg = gaussian_filter1d(zneg, sigma_) - - peaks_neg, _ = find_peaks(zneg, height=0) - peaks, _ = find_peaks(z, height=0) - - peaks_neg = peaks_neg - 10 - 10 - - last_nonzero = last_nonzero - 0 # 100 - first_nonzero = first_nonzero + 0 # +100 - - peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & (peaks_neg < last_nonzero)] - - peaks = peaks[(peaks > 0.06 * regions_without_seperators.shape[1]) & (peaks < 0.94 * regions_without_seperators.shape[1])] - - interest_pos = z[peaks] - - interest_pos = interest_pos[interest_pos > 10] - - interest_neg = z[peaks_neg] - - if interest_neg[0] < 0.1: - interest_neg = interest_neg[1:] - if interest_neg[len(interest_neg) - 1] < 0.1: - interest_neg = interest_neg[: len(interest_neg) - 1] - - min_peaks_pos = np.min(interest_pos) - min_peaks_neg = 0 # np.min(interest_neg) - - dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier - grenze = min_peaks_pos - dis_talaei # np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 - - interest_neg_fin = interest_neg # [(interest_neg= top) & ((matrix_of_orders[:, 3] < down))] - cxs_in = matrix_of_orders[:, 2][(matrix_of_orders[:, 3] >= top) & ((matrix_of_orders[:, 3] < down))] - - sorted_inside = np.argsort(cxs_in) - - ind_in_int = indexes_in[sorted_inside] - - for j in range(len(ind_in_int)): - final_indexers_sorted.append(int(ind_in_int[j])) - - return final_indexers_sorted, matrix_of_orders - -def remove_headers_and_mains_intersection(seperators_closeup_n, img_revised_tab, boxes): - for ind in range(len(boxes)): - asp = np.zeros((img_revised_tab[:, :, 0].shape[0], seperators_closeup_n[:, :, 0].shape[1])) - asp[int(boxes[ind][2]) : int(boxes[ind][3]), int(boxes[ind][0]) : int(boxes[ind][1])] = img_revised_tab[int(boxes[ind][2]) : int(boxes[ind][3]), 
int(boxes[ind][0]) : int(boxes[ind][1]), 0] - - head_patch_con = (asp[:, :] == 2) * 1 - main_patch_con = (asp[:, :] == 1) * 1 - # print(head_patch_con) - head_patch_con = head_patch_con.astype(np.uint8) - main_patch_con = main_patch_con.astype(np.uint8) - - head_patch_con = np.repeat(head_patch_con[:, :, np.newaxis], 3, axis=2) - main_patch_con = np.repeat(main_patch_con[:, :, np.newaxis], 3, axis=2) - - imgray = cv2.cvtColor(head_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_head_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_head_patch_con = return_parent_contours(contours_head_patch_con, hiearchy) - - imgray = cv2.cvtColor(main_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_main_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_main_patch_con = return_parent_contours(contours_main_patch_con, hiearchy) - - y_patch_head_min, y_patch_head_max, _ = find_features_of_contours(contours_head_patch_con) - y_patch_main_min, y_patch_main_max, _ = find_features_of_contours(contours_main_patch_con) - - for i in range(len(y_patch_head_min)): - for j in range(len(y_patch_main_min)): - if y_patch_head_max[i] > y_patch_main_min[j] and y_patch_head_min[i] < y_patch_main_min[j]: - y_down = y_patch_head_max[i] - y_up = y_patch_main_min[j] - - patch_intersection = np.zeros(asp.shape) - patch_intersection[y_up:y_down, :] = asp[y_up:y_down, :] - - head_patch_con = (patch_intersection[:, :] == 2) * 1 - main_patch_con = (patch_intersection[:, :] == 1) * 1 - head_patch_con = head_patch_con.astype(np.uint8) - main_patch_con = main_patch_con.astype(np.uint8) - - head_patch_con = np.repeat(head_patch_con[:, :, np.newaxis], 3, axis=2) - main_patch_con = np.repeat(main_patch_con[:, :, np.newaxis], 3, axis=2) - - imgray = cv2.cvtColor(head_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = 
cv2.threshold(imgray, 0, 255, 0) - - contours_head_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_head_patch_con = return_parent_contours(contours_head_patch_con, hiearchy) - - imgray = cv2.cvtColor(main_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_main_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_main_patch_con = return_parent_contours(contours_main_patch_con, hiearchy) - - _, _, areas_head = find_features_of_contours(contours_head_patch_con) - _, _, areas_main = find_features_of_contours(contours_main_patch_con) - - if np.sum(areas_head) > np.sum(areas_main): - img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0][img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0] == 1] = 2 - else: - img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0][img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0] == 2] = 1 - - elif y_patch_head_min[i] < y_patch_main_max[j] and y_patch_head_max[i] > y_patch_main_max[j]: - y_down = y_patch_main_max[j] - y_up = y_patch_head_min[i] - - patch_intersection = np.zeros(asp.shape) - patch_intersection[y_up:y_down, :] = asp[y_up:y_down, :] - - head_patch_con = (patch_intersection[:, :] == 2) * 1 - main_patch_con = (patch_intersection[:, :] == 1) * 1 - head_patch_con = head_patch_con.astype(np.uint8) - main_patch_con = main_patch_con.astype(np.uint8) - - head_patch_con = np.repeat(head_patch_con[:, :, np.newaxis], 3, axis=2) - main_patch_con = np.repeat(main_patch_con[:, :, np.newaxis], 3, axis=2) - - imgray = cv2.cvtColor(head_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_head_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_head_patch_con = return_parent_contours(contours_head_patch_con, hiearchy) - - imgray = 
cv2.cvtColor(main_patch_con, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_main_patch_con, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - contours_main_patch_con = return_parent_contours(contours_main_patch_con, hiearchy) - - _, _, areas_head = find_features_of_contours(contours_head_patch_con) - _, _, areas_main = find_features_of_contours(contours_main_patch_con) - - if np.sum(areas_head) > np.sum(areas_main): - img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0][img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0] == 1] = 2 - else: - img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0][img_revised_tab[y_up:y_down, int(boxes[ind][0]) : int(boxes[ind][1]), 0] == 2] = 1 - - # print(np.unique(patch_intersection) ) - ##plt.figure(figsize=(20,20)) - ##plt.imshow(patch_intersection) - ##plt.show() - else: - pass - - return img_revised_tab - -def tear_main_texts_on_the_boundaries_of_boxes(img_revised_tab, boxes): - for i in range(len(boxes)): - img_revised_tab[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 0][img_revised_tab[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 0] == 1] = 0 - img_revised_tab[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 1][img_revised_tab[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 1] == 1] = 0 - img_revised_tab[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 2][img_revised_tab[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][1] - 10) : int(boxes[i][1]), 2] == 1] = 0 - return img_revised_tab - -def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back(self, regions_pre_p): - seperators_closeup = ((regions_pre_p[:, :] == 6)) * 1 - - seperators_closeup = seperators_closeup.astype(np.uint8) - kernel = np.ones((5, 5), np.uint8) - - 
seperators_closeup = cv2.dilate(seperators_closeup, kernel, iterations=1) - seperators_closeup = cv2.erode(seperators_closeup, kernel, iterations=1) - - seperators_closeup = cv2.erode(seperators_closeup, kernel, iterations=1) - seperators_closeup = cv2.dilate(seperators_closeup, kernel, iterations=1) - - if len(seperators_closeup.shape) == 2: - seperators_closeup_n = np.zeros((seperators_closeup.shape[0], seperators_closeup.shape[1], 3)) - seperators_closeup_n[:, :, 0] = seperators_closeup - seperators_closeup_n[:, :, 1] = seperators_closeup - seperators_closeup_n[:, :, 2] = seperators_closeup - else: - seperators_closeup_n = seperators_closeup[:, :, :] - # seperators_closeup=seperators_closeup.astype(np.uint8) - seperators_closeup_n = seperators_closeup_n.astype(np.uint8) - imgray = cv2.cvtColor(seperators_closeup_n, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_lines, hierachy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = find_features_of_lines(contours_lines) - - dist_y = np.abs(y_max_main - y_min_main) - - slope_lines_org_hor = slope_lines_org[slope_lines == 0] - args = np.array(range(len(slope_lines))) - len_x = seperators_closeup.shape[1] * 0 - len_y = seperators_closeup.shape[0] * 0.01 - - args_hor = args[slope_lines == 0] - dist_x_hor = dist_x[slope_lines == 0] - dist_y_hor = dist_y[slope_lines == 0] - x_min_main_hor = x_min_main[slope_lines == 0] - x_max_main_hor = x_max_main[slope_lines == 0] - cy_main_hor = cy_main[slope_lines == 0] - y_min_main_hor = y_min_main[slope_lines == 0] - y_max_main_hor = y_max_main[slope_lines == 0] - - args_hor = args_hor[dist_x_hor >= len_x] - x_max_main_hor = x_max_main_hor[dist_x_hor >= len_x] - x_min_main_hor = x_min_main_hor[dist_x_hor >= len_x] - cy_main_hor = cy_main_hor[dist_x_hor >= len_x] - y_min_main_hor = y_min_main_hor[dist_x_hor >= len_x] - 
y_max_main_hor = y_max_main_hor[dist_x_hor >= len_x] - slope_lines_org_hor = slope_lines_org_hor[dist_x_hor >= len_x] - dist_y_hor = dist_y_hor[dist_x_hor >= len_x] - dist_x_hor = dist_x_hor[dist_x_hor >= len_x] - - args_ver = args[slope_lines == 1] - dist_y_ver = dist_y[slope_lines == 1] - dist_x_ver = dist_x[slope_lines == 1] - x_min_main_ver = x_min_main[slope_lines == 1] - x_max_main_ver = x_max_main[slope_lines == 1] - y_min_main_ver = y_min_main[slope_lines == 1] - y_max_main_ver = y_max_main[slope_lines == 1] - cx_main_ver = cx_main[slope_lines == 1] - - args_ver = args_ver[dist_y_ver >= len_y] - x_max_main_ver = x_max_main_ver[dist_y_ver >= len_y] - x_min_main_ver = x_min_main_ver[dist_y_ver >= len_y] - cx_main_ver = cx_main_ver[dist_y_ver >= len_y] - y_min_main_ver = y_min_main_ver[dist_y_ver >= len_y] - y_max_main_ver = y_max_main_ver[dist_y_ver >= len_y] - dist_x_ver = dist_x_ver[dist_y_ver >= len_y] - dist_y_ver = dist_y_ver[dist_y_ver >= len_y] - - img_p_in_ver = np.zeros(seperators_closeup_n[:, :, 2].shape) - for jv in range(len(args_ver)): - img_p_in_ver = cv2.fillPoly(img_p_in_ver, pts=[contours_lines[args_ver[jv]]], color=(1, 1, 1)) - - img_in_hor = np.zeros(seperators_closeup_n[:, :, 2].shape) - for jv in range(len(args_hor)): - img_p_in_hor = cv2.fillPoly(img_in_hor, pts=[contours_lines[args_hor[jv]]], color=(1, 1, 1)) - - all_args_uniq = contours_in_same_horizon(cy_main_hor) - # print(all_args_uniq,'all_args_uniq') - if len(all_args_uniq) > 0: - if type(all_args_uniq[0]) is list: - contours_new = [] - for dd in range(len(all_args_uniq)): - merged_all = None - some_args = args_hor[all_args_uniq[dd]] - some_cy = cy_main_hor[all_args_uniq[dd]] - some_x_min = x_min_main_hor[all_args_uniq[dd]] - some_x_max = x_max_main_hor[all_args_uniq[dd]] - - img_in = np.zeros(seperators_closeup_n[:, :, 2].shape) - for jv in range(len(some_args)): - - img_p_in = cv2.fillPoly(img_p_in_hor, pts=[contours_lines[some_args[jv]]], color=(1, 1, 1)) - 
img_p_in[int(np.mean(some_cy)) - 5 : int(np.mean(some_cy)) + 5, int(np.min(some_x_min)) : int(np.max(some_x_max))] = 1 - - else: - img_p_in = seperators_closeup - else: - img_p_in = seperators_closeup - - sep_ver_hor = img_p_in + img_p_in_ver - sep_ver_hor_cross = (sep_ver_hor == 2) * 1 - - sep_ver_hor_cross = np.repeat(sep_ver_hor_cross[:, :, np.newaxis], 3, axis=2) - sep_ver_hor_cross = sep_ver_hor_cross.astype(np.uint8) - imgray = cv2.cvtColor(sep_ver_hor_cross, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_cross, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - cx_cross, cy_cross, _, _, _, _, _ = find_new_features_of_contoures(contours_cross) - - for ii in range(len(cx_cross)): - sep_ver_hor[int(cy_cross[ii]) - 15 : int(cy_cross[ii]) + 15, int(cx_cross[ii]) + 5 : int(cx_cross[ii]) + 40] = 0 - sep_ver_hor[int(cy_cross[ii]) - 15 : int(cy_cross[ii]) + 15, int(cx_cross[ii]) - 40 : int(cx_cross[ii]) - 4] = 0 - - img_p_in[:, :] = sep_ver_hor[:, :] - - if len(img_p_in.shape) == 2: - seperators_closeup_n = np.zeros((img_p_in.shape[0], img_p_in.shape[1], 3)) - seperators_closeup_n[:, :, 0] = img_p_in - seperators_closeup_n[:, :, 1] = img_p_in - seperators_closeup_n[:, :, 2] = img_p_in - else: - seperators_closeup_n = img_p_in[:, :, :] - # seperators_closeup=seperators_closeup.astype(np.uint8) - seperators_closeup_n = seperators_closeup_n.astype(np.uint8) - imgray = cv2.cvtColor(seperators_closeup_n, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_lines, hierachy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = find_features_of_lines(contours_lines) - - dist_y = np.abs(y_max_main - y_min_main) - - slope_lines_org_hor = slope_lines_org[slope_lines == 0] - args = np.array(range(len(slope_lines))) - len_x = seperators_closeup.shape[1] * 0.04 - len_y = 
seperators_closeup.shape[0] * 0.08 - - args_hor = args[slope_lines == 0] - dist_x_hor = dist_x[slope_lines == 0] - dist_y_hor = dist_y[slope_lines == 0] - x_min_main_hor = x_min_main[slope_lines == 0] - x_max_main_hor = x_max_main[slope_lines == 0] - cy_main_hor = cy_main[slope_lines == 0] - y_min_main_hor = y_min_main[slope_lines == 0] - y_max_main_hor = y_max_main[slope_lines == 0] - - args_hor = args_hor[dist_x_hor >= len_x] - x_max_main_hor = x_max_main_hor[dist_x_hor >= len_x] - x_min_main_hor = x_min_main_hor[dist_x_hor >= len_x] - cy_main_hor = cy_main_hor[dist_x_hor >= len_x] - y_min_main_hor = y_min_main_hor[dist_x_hor >= len_x] - y_max_main_hor = y_max_main_hor[dist_x_hor >= len_x] - slope_lines_org_hor = slope_lines_org_hor[dist_x_hor >= len_x] - dist_y_hor = dist_y_hor[dist_x_hor >= len_x] - dist_x_hor = dist_x_hor[dist_x_hor >= len_x] - - args_ver = args[slope_lines == 1] - dist_y_ver = dist_y[slope_lines == 1] - dist_x_ver = dist_x[slope_lines == 1] - x_min_main_ver = x_min_main[slope_lines == 1] - x_max_main_ver = x_max_main[slope_lines == 1] - y_min_main_ver = y_min_main[slope_lines == 1] - y_max_main_ver = y_max_main[slope_lines == 1] - cx_main_ver = cx_main[slope_lines == 1] - - args_ver = args_ver[dist_y_ver >= len_y] - x_max_main_ver = x_max_main_ver[dist_y_ver >= len_y] - x_min_main_ver = x_min_main_ver[dist_y_ver >= len_y] - cx_main_ver = cx_main_ver[dist_y_ver >= len_y] - y_min_main_ver = y_min_main_ver[dist_y_ver >= len_y] - y_max_main_ver = y_max_main_ver[dist_y_ver >= len_y] - dist_x_ver = dist_x_ver[dist_y_ver >= len_y] - dist_y_ver = dist_y_ver[dist_y_ver >= len_y] - - matrix_of_lines_ch = np.zeros((len(cy_main_hor) + len(cx_main_ver), 10)) - - matrix_of_lines_ch[: len(cy_main_hor), 0] = args_hor - matrix_of_lines_ch[len(cy_main_hor) :, 0] = args_ver - - matrix_of_lines_ch[len(cy_main_hor) :, 1] = cx_main_ver - - matrix_of_lines_ch[: len(cy_main_hor), 2] = x_min_main_hor - matrix_of_lines_ch[len(cy_main_hor) :, 2] = x_min_main_ver - - 
matrix_of_lines_ch[: len(cy_main_hor), 3] = x_max_main_hor - matrix_of_lines_ch[len(cy_main_hor) :, 3] = x_max_main_ver - - matrix_of_lines_ch[: len(cy_main_hor), 4] = dist_x_hor - matrix_of_lines_ch[len(cy_main_hor) :, 4] = dist_x_ver - - matrix_of_lines_ch[: len(cy_main_hor), 5] = cy_main_hor - - matrix_of_lines_ch[: len(cy_main_hor), 6] = y_min_main_hor - matrix_of_lines_ch[len(cy_main_hor) :, 6] = y_min_main_ver - - matrix_of_lines_ch[: len(cy_main_hor), 7] = y_max_main_hor - matrix_of_lines_ch[len(cy_main_hor) :, 7] = y_max_main_ver - - matrix_of_lines_ch[: len(cy_main_hor), 8] = dist_y_hor - matrix_of_lines_ch[len(cy_main_hor) :, 8] = dist_y_ver - - matrix_of_lines_ch[len(cy_main_hor) :, 9] = 1 - - return matrix_of_lines_ch, seperators_closeup_n - -def image_change_background_pixels_to_zero(self, image_page): - image_back_zero = np.zeros((image_page.shape[0], image_page.shape[1])) - image_back_zero[:, :] = image_page[:, :, 0] - image_back_zero[:, :][image_back_zero[:, :] == 0] = -255 - image_back_zero[:, :][image_back_zero[:, :] == 255] = 0 - image_back_zero[:, :][image_back_zero[:, :] == -255] = 255 - return image_back_zero - -def return_boxes_of_images_by_order_of_reading_without_seperator(spliter_y_new, image_p_rev, regions_without_seperators, matrix_of_lines_ch, seperators_closeup_n): - - boxes = [] - - # here I go through main spliters and i do check whether a vertical seperator there is. If so i am searching for \ - # holes in the text and also finding spliter which covers more than one columns. - for i in range(len(spliter_y_new) - 1): - # print(spliter_y_new[i],spliter_y_new[i+1]) - matrix_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 6] > spliter_y_new[i]) & (matrix_of_lines_ch[:, 7] < spliter_y_new[i + 1])] - # print(len( matrix_new[:,9][matrix_new[:,9]==1] )) - - # print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') - - # check to see is there any vertical seperator to find holes. 
- if np.abs(spliter_y_new[i + 1] - spliter_y_new[i]) > 1.0 / 3.0 * regions_without_seperators.shape[0]: # len( matrix_new[:,9][matrix_new[:,9]==1] )>0 and np.max(matrix_new[:,8][matrix_new[:,9]==1])>=0.1*(np.abs(spliter_y_new[i+1]-spliter_y_new[i] )): - - # org_img_dichte=-gaussian_filter1d(( image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,0]/255.).sum(axis=0) ,30) - # org_img_dichte=org_img_dichte-np.min(org_img_dichte) - ##plt.figure(figsize=(20,20)) - ##plt.plot(org_img_dichte) - ##plt.show() - ###find_num_col_both_layout_and_org(regions_without_seperators,image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,:],7.) - - num_col, peaks_neg_fin = find_num_col_only_image(image_p_rev[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=2.4) - - # num_col, peaks_neg_fin=find_num_col(regions_without_seperators[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:],multiplier=7.0) - x_min_hor_some = matrix_new[:, 2][(matrix_new[:, 9] == 0)] - x_max_hor_some = matrix_new[:, 3][(matrix_new[:, 9] == 0)] - cy_hor_some = matrix_new[:, 5][(matrix_new[:, 9] == 0)] - arg_org_hor_some = matrix_new[:, 0][(matrix_new[:, 9] == 0)] - - peaks_neg_tot = return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) - - start_index_of_hor, newest_peaks, arg_min_hor_sort, lines_length_dels, lines_indexes_deleted = return_hor_spliter_by_index_for_without_verticals(peaks_neg_tot, x_min_hor_some, x_max_hor_some) - - arg_org_hor_some_sort = arg_org_hor_some[arg_min_hor_sort] - - start_index_of_hor_with_subset = [start_index_of_hor[vij] for vij in range(len(start_index_of_hor)) if lines_length_dels[vij] > 0] # start_index_of_hor[lines_length_dels>0] - arg_min_hor_sort_with_subset = [arg_min_hor_sort[vij] for vij in range(len(start_index_of_hor)) if lines_length_dels[vij] > 0] - lines_indexes_deleted_with_subset = [lines_indexes_deleted[vij] for vij in range(len(start_index_of_hor)) if lines_length_dels[vij] > 0] - lines_length_dels_with_subset = 
[lines_length_dels[vij] for vij in range(len(start_index_of_hor)) if lines_length_dels[vij] > 0] - - arg_org_hor_some_sort_subset = [arg_org_hor_some_sort[vij] for vij in range(len(start_index_of_hor)) if lines_length_dels[vij] > 0] - - # arg_min_hor_sort_with_subset=arg_min_hor_sort[lines_length_dels>0] - # lines_indexes_deleted_with_subset=lines_indexes_deleted[lines_length_dels>0] - # lines_length_dels_with_subset=lines_length_dels[lines_length_dels>0] - - # print(len(arg_min_hor_sort),len(arg_org_hor_some_sort),'vizzzzzz') - - vahid_subset = np.zeros((len(start_index_of_hor_with_subset), len(start_index_of_hor_with_subset))) - 1 - for kkk1 in range(len(start_index_of_hor_with_subset)): - - # print(lines_indexes_deleted,'hiii') - index_del_sub = np.unique(lines_indexes_deleted_with_subset[kkk1]) - - for kkk2 in range(len(start_index_of_hor_with_subset)): - - if set(lines_indexes_deleted_with_subset[kkk2][0]) < set(lines_indexes_deleted_with_subset[kkk1][0]): - vahid_subset[kkk1, kkk2] = kkk1 - else: - pass - # print(set(lines_indexes_deleted[kkk2][0]), set(lines_indexes_deleted[kkk1][0])) - - # check the len of matrix if it has no length means that there is no spliter at all - - if len(vahid_subset > 0): - # print('hihoo') - - # find parenets args - line_int = np.zeros(vahid_subset.shape[0]) - - childs_id = [] - arg_child = [] - for li in range(vahid_subset.shape[0]): - if np.all(vahid_subset[:, li] == -1): - line_int[li] = -1 - else: - line_int[li] = 1 - - # childs_args_in=[ idd for idd in range(vahid_subset.shape[0]) if vahid_subset[idd,li]!=-1] - # helpi=[] - # for nad in range(len(childs_args_in)): - # helpi.append(arg_min_hor_sort_with_subset[childs_args_in[nad]]) - - arg_child.append(arg_min_hor_sort_with_subset[li]) - - arg_parent = [arg_min_hor_sort_with_subset[vij] for vij in range(len(arg_min_hor_sort_with_subset)) if line_int[vij] == -1] - start_index_of_hor_parent = [start_index_of_hor_with_subset[vij] for vij in 
range(len(arg_min_hor_sort_with_subset)) if line_int[vij] == -1] - # arg_parent=[lines_indexes_deleted_with_subset[vij] for vij in range(len(arg_min_hor_sort_with_subset)) if line_int[vij]==-1] - # arg_parent=[lines_length_dels_with_subset[vij] for vij in range(len(arg_min_hor_sort_with_subset)) if line_int[vij]==-1] - - # arg_child=[arg_min_hor_sort_with_subset[vij] for vij in range(len(arg_min_hor_sort_with_subset)) if line_int[vij]!=-1] - start_index_of_hor_child = [start_index_of_hor_with_subset[vij] for vij in range(len(arg_min_hor_sort_with_subset)) if line_int[vij] != -1] - - cy_hor_some_sort = cy_hor_some[arg_parent] - - newest_y_spliter_tot = [] - - for tj in range(len(newest_peaks) - 1): - newest_y_spliter = [] - newest_y_spliter.append(spliter_y_new[i]) - if tj in np.unique(start_index_of_hor_parent): - cy_help = np.array(cy_hor_some_sort)[np.array(start_index_of_hor_parent) == tj] - cy_help_sort = np.sort(cy_help) - - # print(tj,cy_hor_some_sort,start_index_of_hor,cy_help,'maashhaha') - for mj in range(len(cy_help_sort)): - newest_y_spliter.append(cy_help_sort[mj]) - newest_y_spliter.append(spliter_y_new[i + 1]) - - newest_y_spliter_tot.append(newest_y_spliter) - - else: - line_int = [] - newest_y_spliter_tot = [] - - for tj in range(len(newest_peaks) - 1): - newest_y_spliter = [] - newest_y_spliter.append(spliter_y_new[i]) - - newest_y_spliter.append(spliter_y_new[i + 1]) - - newest_y_spliter_tot.append(newest_y_spliter) - - # if line_int is all -1 means that big spliters have no child and we can easily go through - if np.all(np.array(line_int) == -1): - for j in range(len(newest_peaks) - 1): - newest_y_spliter = newest_y_spliter_tot[j] - - for n in range(len(newest_y_spliter) - 1): - # print(j,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'maaaa') - ##plt.imshow(regions_without_seperators[int(newest_y_spliter[n]):int(newest_y_spliter[n+1]),newest_peaks[j]:newest_peaks[j+1]]) - ##plt.show() - - # print(matrix_new[:,0][ 
(matrix_new[:,9]==1 )]) - for jvt in matrix_new[:, 0][(matrix_new[:, 9] == 1) & (matrix_new[:, 6] > newest_y_spliter[n]) & (matrix_new[:, 7] < newest_y_spliter[n + 1]) & ((matrix_new[:, 1]) < newest_peaks[j + 1]) & ((matrix_new[:, 1]) > newest_peaks[j])]: - pass - - ###plot_contour(regions_without_seperators.shape[0],regions_without_seperators.shape[1], contours_lines[int(jvt)]) - # print(matrix_of_lines_ch[matrix_of_lines_ch[:,9]==1]) - matrix_new_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > newest_y_spliter[n]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter[n + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < newest_peaks[j + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > newest_peaks[j])] - # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') - if 1 > 0: # len( matrix_new_new[:,9][matrix_new_new[:,9]==1] )>0 and np.max(matrix_new_new[:,8][matrix_new_new[:,9]==1])>=0.2*(np.abs(newest_y_spliter[n+1]-newest_y_spliter[n] )): - # num_col_sub, peaks_neg_fin_sub=find_num_col(regions_without_seperators[int(newest_y_spliter[n]):int(newest_y_spliter[n+1]),newest_peaks[j]:newest_peaks[j+1]],multiplier=2.3) - num_col_sub, peaks_neg_fin_sub = find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.4) - else: - peaks_neg_fin_sub = [] - - peaks_sub = [] - peaks_sub.append(newest_peaks[j]) - - for kj in range(len(peaks_neg_fin_sub)): - peaks_sub.append(peaks_neg_fin_sub[kj] + newest_peaks[j]) - - peaks_sub.append(newest_peaks[j + 1]) - - # peaks_sub=return_points_with_boundies(peaks_neg_fin_sub+newest_peaks[j],newest_peaks[j], newest_peaks[j+1]) - - for kh in range(len(peaks_sub) - 1): - boxes.append([peaks_sub[kh], peaks_sub[kh + 1], newest_y_spliter[n], newest_y_spliter[n + 1]]) - - else: - for j in range(len(newest_peaks) - 1): - newest_y_spliter = newest_y_spliter_tot[j] - - if j in 
start_index_of_hor_parent: - - x_min_ch = x_min_hor_some[arg_child] - x_max_ch = x_max_hor_some[arg_child] - cy_hor_some_sort_child = cy_hor_some[arg_child] - cy_hor_some_sort_child = np.sort(cy_hor_some_sort_child) - - for n in range(len(newest_y_spliter) - 1): - - cy_child_in = cy_hor_some_sort_child[(cy_hor_some_sort_child > newest_y_spliter[n]) & (cy_hor_some_sort_child < newest_y_spliter[n + 1])] - - if len(cy_child_in) > 0: - ###num_col_ch, peaks_neg_ch=find_num_col( regions_without_seperators[int(newest_y_spliter[n]):int(newest_y_spliter[n+1]),newest_peaks[j]:newest_peaks[j+1]],multiplier=2.3) - - num_col_ch, peaks_neg_ch = find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.3) - - peaks_neg_ch = peaks_neg_ch[:] + newest_peaks[j] - - peaks_neg_ch_tot = return_points_with_boundies(peaks_neg_ch, newest_peaks[j], newest_peaks[j + 1]) - - ss_in_ch, nst_p_ch, arg_n_ch, lines_l_del_ch, lines_in_del_ch = return_hor_spliter_by_index_for_without_verticals(peaks_neg_ch_tot, x_min_ch, x_max_ch) - - newest_y_spliter_ch_tot = [] - - for tjj in range(len(nst_p_ch) - 1): - newest_y_spliter_new = [] - newest_y_spliter_new.append(newest_y_spliter[n]) - if tjj in np.unique(ss_in_ch): - - # print(tj,cy_hor_some_sort,start_index_of_hor,cy_help,'maashhaha') - for mjj in range(len(cy_child_in)): - newest_y_spliter_new.append(cy_child_in[mjj]) - newest_y_spliter_new.append(newest_y_spliter[n + 1]) - - newest_y_spliter_ch_tot.append(newest_y_spliter_new) - - for jn in range(len(nst_p_ch) - 1): - newest_y_spliter_h = newest_y_spliter_ch_tot[jn] - - for nd in range(len(newest_y_spliter_h) - 1): - - matrix_new_new2 = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > newest_y_spliter_h[nd]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter_h[nd + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < nst_p_ch[jn + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > nst_p_ch[jn])] - 
# print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') - if 1 > 0: # len( matrix_new_new2[:,9][matrix_new_new2[:,9]==1] )>0 and np.max(matrix_new_new2[:,8][matrix_new_new2[:,9]==1])>=0.2*(np.abs(newest_y_spliter_h[nd+1]-newest_y_spliter_h[nd] )): - # num_col_sub_ch, peaks_neg_fin_sub_ch=find_num_col(regions_without_seperators[int(newest_y_spliter_h[nd]):int(newest_y_spliter_h[nd+1]),nst_p_ch[jn]:nst_p_ch[jn+1]],multiplier=2.3) - - num_col_sub_ch, peaks_neg_fin_sub_ch = find_num_col_only_image(image_p_rev[int(newest_y_spliter_h[nd]) : int(newest_y_spliter_h[nd + 1]), nst_p_ch[jn] : nst_p_ch[jn + 1]], multiplier=2.3) - # print(peaks_neg_fin_sub_ch,'gada kutullllllll') - else: - peaks_neg_fin_sub_ch = [] - - peaks_sub_ch = [] - peaks_sub_ch.append(nst_p_ch[jn]) - - for kjj in range(len(peaks_neg_fin_sub_ch)): - peaks_sub_ch.append(peaks_neg_fin_sub_ch[kjj] + nst_p_ch[jn]) - - peaks_sub_ch.append(nst_p_ch[jn + 1]) - - # peaks_sub=return_points_with_boundies(peaks_neg_fin_sub+newest_peaks[j],newest_peaks[j], newest_peaks[j+1]) - - for khh in range(len(peaks_sub_ch) - 1): - boxes.append([peaks_sub_ch[khh], peaks_sub_ch[khh + 1], newest_y_spliter_h[nd], newest_y_spliter_h[nd + 1]]) - - else: - - matrix_new_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > newest_y_spliter[n]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter[n + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < newest_peaks[j + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > newest_peaks[j])] - # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') - if 1 > 0: # len( matrix_new_new[:,9][matrix_new_new[:,9]==1] )>0 and np.max(matrix_new_new[:,8][matrix_new_new[:,9]==1])>=0.2*(np.abs(newest_y_spliter[n+1]-newest_y_spliter[n] )): - ###num_col_sub, 
peaks_neg_fin_sub=find_num_col(regions_without_seperators[int(newest_y_spliter[n]):int(newest_y_spliter[n+1]),newest_peaks[j]:newest_peaks[j+1]],multiplier=2.3) - num_col_sub, peaks_neg_fin_sub = find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.3) - else: - peaks_neg_fin_sub = [] - - peaks_sub = [] - peaks_sub.append(newest_peaks[j]) - - for kj in range(len(peaks_neg_fin_sub)): - peaks_sub.append(peaks_neg_fin_sub[kj] + newest_peaks[j]) - - peaks_sub.append(newest_peaks[j + 1]) - - # peaks_sub=return_points_with_boundies(peaks_neg_fin_sub+newest_peaks[j],newest_peaks[j], newest_peaks[j+1]) - - for kh in range(len(peaks_sub) - 1): - boxes.append([peaks_sub[kh], peaks_sub[kh + 1], newest_y_spliter[n], newest_y_spliter[n + 1]]) - - else: - for n in range(len(newest_y_spliter) - 1): - - for jvt in matrix_new[:, 0][(matrix_new[:, 9] == 1) & (matrix_new[:, 6] > newest_y_spliter[n]) & (matrix_new[:, 7] < newest_y_spliter[n + 1]) & ((matrix_new[:, 1]) < newest_peaks[j + 1]) & ((matrix_new[:, 1]) > newest_peaks[j])]: - pass - - # plot_contour(regions_without_seperators.shape[0],regions_without_seperators.shape[1], contours_lines[int(jvt)]) - # print(matrix_of_lines_ch[matrix_of_lines_ch[:,9]==1]) - matrix_new_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > newest_y_spliter[n]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter[n + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < newest_peaks[j + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > newest_peaks[j])] - # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') - if 1 > 0: # len( matrix_new_new[:,9][matrix_new_new[:,9]==1] )>0 and np.max(matrix_new_new[:,8][matrix_new_new[:,9]==1])>=0.2*(np.abs(newest_y_spliter[n+1]-newest_y_spliter[n] )): - ###num_col_sub, 
peaks_neg_fin_sub=find_num_col(regions_without_seperators[int(newest_y_spliter[n]):int(newest_y_spliter[n+1]),newest_peaks[j]:newest_peaks[j+1]],multiplier=5.0) - num_col_sub, peaks_neg_fin_sub = find_num_col_only_image(image_p_rev[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=2.3) - else: - peaks_neg_fin_sub = [] - - peaks_sub = [] - peaks_sub.append(newest_peaks[j]) - - for kj in range(len(peaks_neg_fin_sub)): - peaks_sub.append(peaks_neg_fin_sub[kj] + newest_peaks[j]) - - peaks_sub.append(newest_peaks[j + 1]) - - # peaks_sub=return_points_with_boundies(peaks_neg_fin_sub+newest_peaks[j],newest_peaks[j], newest_peaks[j+1]) - - for kh in range(len(peaks_sub) - 1): - boxes.append([peaks_sub[kh], peaks_sub[kh + 1], newest_y_spliter[n], newest_y_spliter[n + 1]]) - - else: - boxes.append([0, seperators_closeup_n[:, :, 0].shape[1], spliter_y_new[i], spliter_y_new[i + 1]]) - return boxes - -def return_region_segmentation_after_implementing_not_head_maintext_parallel(image_regions_eraly_p, boxes): - image_revised = np.zeros((image_regions_eraly_p.shape[0], image_regions_eraly_p.shape[1])) - for i in range(len(boxes)): - - image_box = image_regions_eraly_p[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][0]) : int(boxes[i][1])] - image_box = np.array(image_box) - # plt.imshow(image_box) - # plt.show() - - # print(int(boxes[i][2]),int(boxes[i][3]),int(boxes[i][0]),int(boxes[i][1]),'addaa') - image_box = implent_law_head_main_not_parallel(image_box) - image_box = implent_law_head_main_not_parallel(image_box) - image_box = implent_law_head_main_not_parallel(image_box) - - image_revised[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][0]) : int(boxes[i][1])] = image_box[:, :] - return image_revised - -def return_boxes_of_images_by_order_of_reading_2cols(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, seperators_closeup_n): - boxes = [] - - # here I go through main spliters and i do check whether a 
vertical seperator there is. If so i am searching for \ - # holes in the text and also finding spliter which covers more than one columns. - for i in range(len(spliter_y_new) - 1): - # print(spliter_y_new[i],spliter_y_new[i+1]) - matrix_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 6] > spliter_y_new[i]) & (matrix_of_lines_ch[:, 7] < spliter_y_new[i + 1])] - # print(len( matrix_new[:,9][matrix_new[:,9]==1] )) - - # print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') - - # check to see is there any vertical seperator to find holes. - if 1 > 0: # len( matrix_new[:,9][matrix_new[:,9]==1] )>0 and np.max(matrix_new[:,8][matrix_new[:,9]==1])>=0.1*(np.abs(spliter_y_new[i+1]-spliter_y_new[i] )): - # print(int(spliter_y_new[i]),int(spliter_y_new[i+1]),'burayaaaa galimiirrrrrrrrrrrrrrrrrrrrrrrrrrr') - # org_img_dichte=-gaussian_filter1d(( image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,0]/255.).sum(axis=0) ,30) - # org_img_dichte=org_img_dichte-np.min(org_img_dichte) - ##plt.figure(figsize=(20,20)) - ##plt.plot(org_img_dichte) - ##plt.show() - ###find_num_col_both_layout_and_org(regions_without_seperators,image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,:],7.) 
- - try: - num_col, peaks_neg_fin = find_num_col(regions_without_seperators[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=7.0) - - except: - peaks_neg_fin = [] - num_col = 0 - - peaks_neg_tot = return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) - - for kh in range(len(peaks_neg_tot) - 1): - boxes.append([peaks_neg_tot[kh], peaks_neg_tot[kh + 1], spliter_y_new[i], spliter_y_new[i + 1]]) - - else: - boxes.append([0, seperators_closeup_n[:, :, 0].shape[1], spliter_y_new[i], spliter_y_new[i + 1]]) - - return boxes - -def return_boxes_of_images_by_order_of_reading(spliter_y_new, regions_without_seperators, matrix_of_lines_ch, seperators_closeup_n): - boxes = [] - - # here I go through main spliters and i do check whether a vertical seperator there is. If so i am searching for \ - # holes in the text and also finding spliter which covers more than one columns. - for i in range(len(spliter_y_new) - 1): - # print(spliter_y_new[i],spliter_y_new[i+1]) - matrix_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 6] > spliter_y_new[i]) & (matrix_of_lines_ch[:, 7] < spliter_y_new[i + 1])] - # print(len( matrix_new[:,9][matrix_new[:,9]==1] )) - - # print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') - - # check to see is there any vertical seperator to find holes. - if len(matrix_new[:, 9][matrix_new[:, 9] == 1]) > 0 and np.max(matrix_new[:, 8][matrix_new[:, 9] == 1]) >= 0.1 * (np.abs(spliter_y_new[i + 1] - spliter_y_new[i])): - - # org_img_dichte=-gaussian_filter1d(( image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,0]/255.).sum(axis=0) ,30) - # org_img_dichte=org_img_dichte-np.min(org_img_dichte) - ##plt.figure(figsize=(20,20)) - ##plt.plot(org_img_dichte) - ##plt.show() - ###find_num_col_both_layout_and_org(regions_without_seperators,image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,:],7.) 
- - num_col, peaks_neg_fin = find_num_col(regions_without_seperators[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=7.0) - - # num_col, peaks_neg_fin=find_num_col(regions_without_seperators[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:],multiplier=7.0) - x_min_hor_some = matrix_new[:, 2][(matrix_new[:, 9] == 0)] - x_max_hor_some = matrix_new[:, 3][(matrix_new[:, 9] == 0)] - cy_hor_some = matrix_new[:, 5][(matrix_new[:, 9] == 0)] - arg_org_hor_some = matrix_new[:, 0][(matrix_new[:, 9] == 0)] - - peaks_neg_tot = return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) - - start_index_of_hor, newest_peaks, arg_min_hor_sort, lines_length_dels, lines_indexes_deleted = return_hor_spliter_by_index(peaks_neg_tot, x_min_hor_some, x_max_hor_some) - - arg_org_hor_some_sort = arg_org_hor_some[arg_min_hor_sort] - - start_index_of_hor_with_subset = [start_index_of_hor[vij] for vij in range(len(start_index_of_hor)) if lines_length_dels[vij] > 0] # start_index_of_hor[lines_length_dels>0] - arg_min_hor_sort_with_subset = [arg_min_hor_sort[vij] for vij in range(len(start_index_of_hor)) if lines_length_dels[vij] > 0] - lines_indexes_deleted_with_subset = [lines_indexes_deleted[vij] for vij in range(len(start_index_of_hor)) if lines_length_dels[vij] > 0] - lines_length_dels_with_subset = [lines_length_dels[vij] for vij in range(len(start_index_of_hor)) if lines_length_dels[vij] > 0] - - arg_org_hor_some_sort_subset = [arg_org_hor_some_sort[vij] for vij in range(len(start_index_of_hor)) if lines_length_dels[vij] > 0] - - # arg_min_hor_sort_with_subset=arg_min_hor_sort[lines_length_dels>0] - # lines_indexes_deleted_with_subset=lines_indexes_deleted[lines_length_dels>0] - # lines_length_dels_with_subset=lines_length_dels[lines_length_dels>0] - - vahid_subset = np.zeros((len(start_index_of_hor_with_subset), len(start_index_of_hor_with_subset))) - 1 - for kkk1 in range(len(start_index_of_hor_with_subset)): - - index_del_sub = 
np.unique(lines_indexes_deleted_with_subset[kkk1]) - - for kkk2 in range(len(start_index_of_hor_with_subset)): - - if set(lines_indexes_deleted_with_subset[kkk2][0]) < set(lines_indexes_deleted_with_subset[kkk1][0]): - vahid_subset[kkk1, kkk2] = kkk1 - else: - pass - # print(set(lines_indexes_deleted[kkk2][0]), set(lines_indexes_deleted[kkk1][0])) - - # print(vahid_subset,'zartt222') - - # check the len of matrix if it has no length means that there is no spliter at all - - if len(vahid_subset > 0): - # print('hihoo') - - # find parenets args - line_int = np.zeros(vahid_subset.shape[0]) - - childs_id = [] - arg_child = [] - for li in range(vahid_subset.shape[0]): - # print(vahid_subset[:,li]) - if np.all(vahid_subset[:, li] == -1): - line_int[li] = -1 - else: - line_int[li] = 1 - - # childs_args_in=[ idd for idd in range(vahid_subset.shape[0]) if vahid_subset[idd,li]!=-1] - # helpi=[] - # for nad in range(len(childs_args_in)): - # helpi.append(arg_min_hor_sort_with_subset[childs_args_in[nad]]) - - arg_child.append(arg_min_hor_sort_with_subset[li]) - - # line_int=vahid_subset[0,:] - - # print(arg_child,line_int[0],'zartt33333') - arg_parent = [arg_min_hor_sort_with_subset[vij] for vij in range(len(arg_min_hor_sort_with_subset)) if line_int[vij] == -1] - start_index_of_hor_parent = [start_index_of_hor_with_subset[vij] for vij in range(len(arg_min_hor_sort_with_subset)) if line_int[vij] == -1] - # arg_parent=[lines_indexes_deleted_with_subset[vij] for vij in range(len(arg_min_hor_sort_with_subset)) if line_int[vij]==-1] - # arg_parent=[lines_length_dels_with_subset[vij] for vij in range(len(arg_min_hor_sort_with_subset)) if line_int[vij]==-1] - - # arg_child=[arg_min_hor_sort_with_subset[vij] for vij in range(len(arg_min_hor_sort_with_subset)) if line_int[vij]!=-1] - start_index_of_hor_child = [start_index_of_hor_with_subset[vij] for vij in range(len(arg_min_hor_sort_with_subset)) if line_int[vij] != -1] - - cy_hor_some_sort = cy_hor_some[arg_parent] - - # 
print(start_index_of_hor, lines_length_dels ,lines_indexes_deleted,'zartt') - - # args_indexes=np.array(range(len(start_index_of_hor) )) - - newest_y_spliter_tot = [] - - for tj in range(len(newest_peaks) - 1): - newest_y_spliter = [] - newest_y_spliter.append(spliter_y_new[i]) - if tj in np.unique(start_index_of_hor_parent): - ##print(cy_hor_some_sort) - cy_help = np.array(cy_hor_some_sort)[np.array(start_index_of_hor_parent) == tj] - cy_help_sort = np.sort(cy_help) - - # print(tj,cy_hor_some_sort,start_index_of_hor,cy_help,'maashhaha') - for mj in range(len(cy_help_sort)): - newest_y_spliter.append(cy_help_sort[mj]) - newest_y_spliter.append(spliter_y_new[i + 1]) - - newest_y_spliter_tot.append(newest_y_spliter) - - else: - line_int = [] - newest_y_spliter_tot = [] - - for tj in range(len(newest_peaks) - 1): - newest_y_spliter = [] - newest_y_spliter.append(spliter_y_new[i]) - - newest_y_spliter.append(spliter_y_new[i + 1]) - - newest_y_spliter_tot.append(newest_y_spliter) - - # if line_int is all -1 means that big spliters have no child and we can easily go through - if np.all(np.array(line_int) == -1): - for j in range(len(newest_peaks) - 1): - newest_y_spliter = newest_y_spliter_tot[j] - - for n in range(len(newest_y_spliter) - 1): - # print(j,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'maaaa') - ##plt.imshow(regions_without_seperators[int(newest_y_spliter[n]):int(newest_y_spliter[n+1]),newest_peaks[j]:newest_peaks[j+1]]) - ##plt.show() - - # print(matrix_new[:,0][ (matrix_new[:,9]==1 )]) - for jvt in matrix_new[:, 0][(matrix_new[:, 9] == 1) & (matrix_new[:, 6] > newest_y_spliter[n]) & (matrix_new[:, 7] < newest_y_spliter[n + 1]) & ((matrix_new[:, 1]) < newest_peaks[j + 1]) & ((matrix_new[:, 1]) > newest_peaks[j])]: - pass - - ###plot_contour(regions_without_seperators.shape[0],regions_without_seperators.shape[1], contours_lines[int(jvt)]) - # print(matrix_of_lines_ch[matrix_of_lines_ch[:,9]==1]) - matrix_new_new = 
matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > newest_y_spliter[n]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter[n + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < newest_peaks[j + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > newest_peaks[j])] - # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') - if len(matrix_new_new[:, 9][matrix_new_new[:, 9] == 1]) > 0 and np.max(matrix_new_new[:, 8][matrix_new_new[:, 9] == 1]) >= 0.2 * (np.abs(newest_y_spliter[n + 1] - newest_y_spliter[n])): - num_col_sub, peaks_neg_fin_sub = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) - else: - peaks_neg_fin_sub = [] - - peaks_sub = [] - peaks_sub.append(newest_peaks[j]) - - for kj in range(len(peaks_neg_fin_sub)): - peaks_sub.append(peaks_neg_fin_sub[kj] + newest_peaks[j]) - - peaks_sub.append(newest_peaks[j + 1]) - - # peaks_sub=return_points_with_boundies(peaks_neg_fin_sub+newest_peaks[j],newest_peaks[j], newest_peaks[j+1]) - - for kh in range(len(peaks_sub) - 1): - boxes.append([peaks_sub[kh], peaks_sub[kh + 1], newest_y_spliter[n], newest_y_spliter[n + 1]]) - - else: - for j in range(len(newest_peaks) - 1): - newest_y_spliter = newest_y_spliter_tot[j] - - if j in start_index_of_hor_parent: - - x_min_ch = x_min_hor_some[arg_child] - x_max_ch = x_max_hor_some[arg_child] - cy_hor_some_sort_child = cy_hor_some[arg_child] - cy_hor_some_sort_child = np.sort(cy_hor_some_sort_child) - - # print(cy_hor_some_sort_child,'ychilds') - - for n in range(len(newest_y_spliter) - 1): - - cy_child_in = cy_hor_some_sort_child[(cy_hor_some_sort_child > newest_y_spliter[n]) & (cy_hor_some_sort_child < newest_y_spliter[n + 1])] - - if len(cy_child_in) > 0: - num_col_ch, peaks_neg_ch = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 
1]], multiplier=5.0) - # print(peaks_neg_ch,'mizzzz') - # peaks_neg_ch=[] - # for djh in range(len(peaks_neg_ch)): - # peaks_neg_ch.append( peaks_neg_ch[djh]+newest_peaks[j] ) - - peaks_neg_ch_tot = return_points_with_boundies(peaks_neg_ch, newest_peaks[j], newest_peaks[j + 1]) - - ss_in_ch, nst_p_ch, arg_n_ch, lines_l_del_ch, lines_in_del_ch = return_hor_spliter_by_index(peaks_neg_ch_tot, x_min_ch, x_max_ch) - - newest_y_spliter_ch_tot = [] - - for tjj in range(len(nst_p_ch) - 1): - newest_y_spliter_new = [] - newest_y_spliter_new.append(newest_y_spliter[n]) - if tjj in np.unique(ss_in_ch): - - # print(tj,cy_hor_some_sort,start_index_of_hor,cy_help,'maashhaha') - for mjj in range(len(cy_child_in)): - newest_y_spliter_new.append(cy_child_in[mjj]) - newest_y_spliter_new.append(newest_y_spliter[n + 1]) - - newest_y_spliter_ch_tot.append(newest_y_spliter_new) - - for jn in range(len(nst_p_ch) - 1): - newest_y_spliter_h = newest_y_spliter_ch_tot[jn] - - for nd in range(len(newest_y_spliter_h) - 1): - - matrix_new_new2 = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > newest_y_spliter_h[nd]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter_h[nd + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < nst_p_ch[jn + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > nst_p_ch[jn])] - # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') - if len(matrix_new_new2[:, 9][matrix_new_new2[:, 9] == 1]) > 0 and np.max(matrix_new_new2[:, 8][matrix_new_new2[:, 9] == 1]) >= 0.2 * (np.abs(newest_y_spliter_h[nd + 1] - newest_y_spliter_h[nd])): - num_col_sub_ch, peaks_neg_fin_sub_ch = find_num_col(regions_without_seperators[int(newest_y_spliter_h[nd]) : int(newest_y_spliter_h[nd + 1]), nst_p_ch[jn] : nst_p_ch[jn + 1]], multiplier=5.0) - - else: - peaks_neg_fin_sub_ch = [] - - peaks_sub_ch = [] - peaks_sub_ch.append(nst_p_ch[jn]) - - for kjj in range(len(peaks_neg_fin_sub_ch)): - 
peaks_sub_ch.append(peaks_neg_fin_sub_ch[kjj] + nst_p_ch[jn]) - - peaks_sub_ch.append(nst_p_ch[jn + 1]) - - # peaks_sub=return_points_with_boundies(peaks_neg_fin_sub+newest_peaks[j],newest_peaks[j], newest_peaks[j+1]) - - for khh in range(len(peaks_sub_ch) - 1): - boxes.append([peaks_sub_ch[khh], peaks_sub_ch[khh + 1], newest_y_spliter_h[nd], newest_y_spliter_h[nd + 1]]) - - else: - - matrix_new_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > newest_y_spliter[n]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter[n + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < newest_peaks[j + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > newest_peaks[j])] - # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') - if len(matrix_new_new[:, 9][matrix_new_new[:, 9] == 1]) > 0 and np.max(matrix_new_new[:, 8][matrix_new_new[:, 9] == 1]) >= 0.2 * (np.abs(newest_y_spliter[n + 1] - newest_y_spliter[n])): - num_col_sub, peaks_neg_fin_sub = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) - else: - peaks_neg_fin_sub = [] - - peaks_sub = [] - peaks_sub.append(newest_peaks[j]) - - for kj in range(len(peaks_neg_fin_sub)): - peaks_sub.append(peaks_neg_fin_sub[kj] + newest_peaks[j]) - - peaks_sub.append(newest_peaks[j + 1]) - - # peaks_sub=return_points_with_boundies(peaks_neg_fin_sub+newest_peaks[j],newest_peaks[j], newest_peaks[j+1]) - - for kh in range(len(peaks_sub) - 1): - boxes.append([peaks_sub[kh], peaks_sub[kh + 1], newest_y_spliter[n], newest_y_spliter[n + 1]]) - - else: - for n in range(len(newest_y_spliter) - 1): - - # plot_contour(regions_without_seperators.shape[0],regions_without_seperators.shape[1], contours_lines[int(jvt)]) - # print(matrix_of_lines_ch[matrix_of_lines_ch[:,9]==1]) - matrix_new_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 9] == 1) & (matrix_of_lines_ch[:, 6] > 
newest_y_spliter[n]) & (matrix_of_lines_ch[:, 7] < newest_y_spliter[n + 1]) & ((matrix_of_lines_ch[:, 1] + 500) < newest_peaks[j + 1]) & ((matrix_of_lines_ch[:, 1] - 500) > newest_peaks[j])] - # print(matrix_new_new,newest_y_spliter[n],newest_y_spliter[n+1],newest_peaks[j],newest_peaks[j+1],'gada') - if len(matrix_new_new[:, 9][matrix_new_new[:, 9] == 1]) > 0 and np.max(matrix_new_new[:, 8][matrix_new_new[:, 9] == 1]) >= 0.2 * (np.abs(newest_y_spliter[n + 1] - newest_y_spliter[n])): - num_col_sub, peaks_neg_fin_sub = find_num_col(regions_without_seperators[int(newest_y_spliter[n]) : int(newest_y_spliter[n + 1]), newest_peaks[j] : newest_peaks[j + 1]], multiplier=5.0) - else: - peaks_neg_fin_sub = [] - - peaks_sub = [] - peaks_sub.append(newest_peaks[j]) - - for kj in range(len(peaks_neg_fin_sub)): - peaks_sub.append(peaks_neg_fin_sub[kj] + newest_peaks[j]) - - peaks_sub.append(newest_peaks[j + 1]) - - # peaks_sub=return_points_with_boundies(peaks_neg_fin_sub+newest_peaks[j],newest_peaks[j], newest_peaks[j+1]) - - for kh in range(len(peaks_sub) - 1): - boxes.append([peaks_sub[kh], peaks_sub[kh + 1], newest_y_spliter[n], newest_y_spliter[n + 1]]) - - else: - boxes.append([0, seperators_closeup_n[:, :, 0].shape[1], spliter_y_new[i], spliter_y_new[i + 1]]) - - return boxes - -def return_boxes_of_images_by_order_of_reading_without_seperators_2cols(spliter_y_new, image_p_rev, regions_without_seperators, matrix_of_lines_ch, seperators_closeup_n): - - boxes = [] - - # here I go through main spliters and i do check whether a vertical seperator there is. If so i am searching for \ - # holes in the text and also finding spliter which covers more than one columns. 
- for i in range(len(spliter_y_new) - 1): - # print(spliter_y_new[i],spliter_y_new[i+1]) - matrix_new = matrix_of_lines_ch[:, :][(matrix_of_lines_ch[:, 6] > spliter_y_new[i]) & (matrix_of_lines_ch[:, 7] < spliter_y_new[i + 1])] - # print(len( matrix_new[:,9][matrix_new[:,9]==1] )) - - # print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') - - # check to see is there any vertical seperator to find holes. - if np.abs(spliter_y_new[i + 1] - spliter_y_new[i]) > 1.0 / 3.0 * regions_without_seperators.shape[0]: # len( matrix_new[:,9][matrix_new[:,9]==1] )>0 and np.max(matrix_new[:,8][matrix_new[:,9]==1])>=0.1*(np.abs(spliter_y_new[i+1]-spliter_y_new[i] )): - - # org_img_dichte=-gaussian_filter1d(( image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,0]/255.).sum(axis=0) ,30) - # org_img_dichte=org_img_dichte-np.min(org_img_dichte) - ##plt.figure(figsize=(20,20)) - ##plt.plot(org_img_dichte) - ##plt.show() - ###find_num_col_both_layout_and_org(regions_without_seperators,image_page[int(spliter_y_new[i]):int(spliter_y_new[i+1]),:,:],7.) 
- - try: - num_col, peaks_neg_fin = find_num_col_only_image(image_p_rev[int(spliter_y_new[i]) : int(spliter_y_new[i + 1]), :], multiplier=2.4) - except: - peaks_neg_fin = [] - num_col = 0 - - peaks_neg_tot = return_points_with_boundies(peaks_neg_fin, 0, seperators_closeup_n[:, :, 0].shape[1]) - - for kh in range(len(peaks_neg_tot) - 1): - boxes.append([peaks_neg_tot[kh], peaks_neg_tot[kh + 1], spliter_y_new[i], spliter_y_new[i + 1]]) - else: - boxes.append([0, seperators_closeup_n[:, :, 0].shape[1], spliter_y_new[i], spliter_y_new[i + 1]]) - - return boxes - -def add_tables_heuristic_to_layout(image_regions_eraly_p, boxes, slope_mean_hor, spliter_y, peaks_neg_tot, image_revised): - - image_revised_1 = delete_seperator_around(spliter_y, peaks_neg_tot, image_revised) - img_comm_e = np.zeros(image_revised_1.shape) - img_comm = np.repeat(img_comm_e[:, :, np.newaxis], 3, axis=2) - - for indiv in np.unique(image_revised_1): - - # print(indiv,'indd') - image_col = (image_revised_1 == indiv) * 255 - img_comm_in = np.repeat(image_col[:, :, np.newaxis], 3, axis=2) - img_comm_in = img_comm_in.astype(np.uint8) - - imgray = cv2.cvtColor(img_comm_in, cv2.COLOR_BGR2GRAY) - - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours, hirarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, max_area=1, min_area=0.0001) - - img_comm = cv2.fillPoly(img_comm, pts=main_contours, color=(indiv, indiv, indiv)) - ###img_comm_in=cv2.fillPoly(img_comm, pts =interior_contours, color=(0,0,0)) - - # img_comm=np.repeat(img_comm[:, :, np.newaxis], 3, axis=2) - img_comm = img_comm.astype(np.uint8) - - if not isNaN(slope_mean_hor): - image_revised_last = np.zeros((image_regions_eraly_p.shape[0], image_regions_eraly_p.shape[1], 3)) - for i in range(len(boxes)): - - image_box = img_comm[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][0]) : int(boxes[i][1]), :] - - image_box_tabels_1 = 
(image_box[:, :, 0] == 7) * 1 - - contours_tab, _ = return_contours_of_image(image_box_tabels_1) - - contours_tab = filter_contours_area_of_image_tables(image_box_tabels_1, contours_tab, _, 1, 0.001) - - image_box_tabels_1 = (image_box[:, :, 0] == 6) * 1 - - image_box_tabels_and_m_text = ((image_box[:, :, 0] == 7) | (image_box[:, :, 0] == 1)) * 1 - image_box_tabels_and_m_text = image_box_tabels_and_m_text.astype(np.uint8) - - image_box_tabels_1 = image_box_tabels_1.astype(np.uint8) - image_box_tabels_1 = cv2.dilate(image_box_tabels_1, self.kernel, iterations=5) - - contours_table_m_text, _ = return_contours_of_image(image_box_tabels_and_m_text) - - image_box_tabels = np.repeat(image_box_tabels_1[:, :, np.newaxis], 3, axis=2) - - image_box_tabels = image_box_tabels.astype(np.uint8) - imgray = cv2.cvtColor(image_box_tabels, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours_line, hierachy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - y_min_main_line, y_max_main_line, _ = find_features_of_contours(contours_line) - # _,_,y_min_main_line ,y_max_main_line,x_min_main_line,x_max_main_line=find_new_features_of_contoures(contours_line) - y_min_main_tab, y_max_main_tab, _ = find_features_of_contours(contours_tab) - - cx_tab_m_text, cy_tab_m_text, x_min_tab_m_text, x_max_tab_m_text, y_min_tab_m_text, y_max_tab_m_text = find_new_features_of_contoures(contours_table_m_text) - cx_tabl, cy_tabl, x_min_tabl, x_max_tabl, y_min_tabl, y_max_tabl, _ = find_new_features_of_contoures(contours_tab) - - if len(y_min_main_tab) > 0: - y_down_tabs = [] - y_up_tabs = [] - - for i_t in range(len(y_min_main_tab)): - y_down_tab = [] - y_up_tab = [] - for i_l in range(len(y_min_main_line)): - if y_min_main_tab[i_t] > y_min_main_line[i_l] and y_max_main_tab[i_t] > y_min_main_line[i_l] and y_min_main_tab[i_t] > y_max_main_line[i_l] and y_max_main_tab[i_t] > y_min_main_line[i_l]: - pass - elif y_min_main_tab[i_t] < y_max_main_line[i_l] and 
y_max_main_tab[i_t] < y_max_main_line[i_l] and y_max_main_tab[i_t] < y_min_main_line[i_l] and y_min_main_tab[i_t] < y_min_main_line[i_l]: - pass - elif np.abs(y_max_main_line[i_l] - y_min_main_line[i_l]) < 100: - pass - - else: - y_up_tab.append(np.min([y_min_main_line[i_l], y_min_main_tab[i_t]])) - y_down_tab.append(np.max([y_max_main_line[i_l], y_max_main_tab[i_t]])) - - if len(y_up_tab) == 0: - for v_n in range(len(cx_tab_m_text)): - if cx_tabl[i_t] <= x_max_tab_m_text[v_n] and cx_tabl[i_t] >= x_min_tab_m_text[v_n] and cy_tabl[i_t] <= y_max_tab_m_text[v_n] and cy_tabl[i_t] >= y_min_tab_m_text[v_n] and cx_tabl[i_t] != cx_tab_m_text[v_n] and cy_tabl[i_t] != cy_tab_m_text[v_n]: - y_up_tabs.append(y_min_tab_m_text[v_n]) - y_down_tabs.append(y_max_tab_m_text[v_n]) - # y_up_tabs.append(y_min_main_tab[i_t]) - # y_down_tabs.append(y_max_main_tab[i_t]) - else: - y_up_tabs.append(np.min(y_up_tab)) - y_down_tabs.append(np.max(y_down_tab)) - - else: - y_down_tabs = [] - y_up_tabs = [] - pass - - for ii in range(len(y_up_tabs)): - image_box[y_up_tabs[ii] : y_down_tabs[ii], :, 0] = 7 - - image_revised_last[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][0]) : int(boxes[i][1]), :] = image_box[:, :, :] - - else: - for i in range(len(boxes)): - - image_box = img_comm[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][0]) : int(boxes[i][1]), :] - image_revised_last[int(boxes[i][2]) : int(boxes[i][3]), int(boxes[i][0]) : int(boxes[i][1]), :] = image_box[:, :, :] - - ##plt.figure(figsize=(20,20)) - ##plt.imshow(image_box[:,:,0]) - ##plt.show() - return image_revised_last - -def get_regions_from_xy_2models_ens(self, img): - img_org = np.copy(img) - - img_height_h = img_org.shape[0] - img_width_h = img_org.shape[1] - - model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p_ens) - - gaussian_filter = False - patches = False - binary = False - - ratio_x = 1 - ratio_y = 1 - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), 
int(img_org.shape[1] * ratio_x)) - - prediction_regions_long = self.do_prediction(patches, img, model_region) - - prediction_regions_long = resize_image(prediction_regions_long, img_height_h, img_width_h) - - gaussian_filter = False - patches = True - binary = False - - ratio_x = 1 - ratio_y = 1.2 - median_blur = False - - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), int(img_org.shape[1] * ratio_x)) - - if binary: - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = cv2.medianBlur(img, 5) - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - prediction_regions_org_y = self.do_prediction(patches, img, model_region) - - prediction_regions_org_y = resize_image(prediction_regions_org_y, img_height_h, img_width_h) - - # plt.imshow(prediction_regions_org[:,:,0]) - # plt.show() - # sys.exit() - prediction_regions_org_y = prediction_regions_org_y[:, :, 0] - - mask_zeros_y = (prediction_regions_org_y[:, :] == 0) * 1 - - ratio_x = 1.2 - ratio_y = 1 - median_blur = False - - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), int(img_org.shape[1] * ratio_x)) - - if binary: - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = cv2.medianBlur(img, 5) - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - prediction_regions_org = self.do_prediction(patches, img, model_region) - - prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h) - - # plt.imshow(prediction_regions_org[:,:,0]) - # plt.show() - # sys.exit() - prediction_regions_org = prediction_regions_org[:, :, 0] - - prediction_regions_org[(prediction_regions_org[:, :] == 1) & (mask_zeros_y[:, :] == 1)] = 0 - - prediction_regions_org[(prediction_regions_long[:, :, 0] == 1) & (prediction_regions_org[:, :] == 2)] = 1 - - session_region.close() - del model_region - del session_region - 
gc.collect() - - return prediction_regions_org - -def resize_and_enhance_image(self, is_image_enhanced): - dpi = self.check_dpi() - img = cv2.imread(self.image_dir) - img = img.astype(np.uint8) - # sys.exit() - - print(dpi) - - if dpi < 298: - if img.shape[0] < 1000: - img_h_new = int(img.shape[0] * 3) - img_w_new = int(img.shape[1] * 3) - if img_h_new < 2800: - img_h_new = 3000 - img_w_new = int(img.shape[1] / float(img.shape[0]) * 3000) - elif img.shape[0] >= 1000 and img.shape[0] < 2000: - img_h_new = int(img.shape[0] * 2) - img_w_new = int(img.shape[1] * 2) - if img_h_new < 2800: - img_h_new = 3000 - img_w_new = int(img.shape[1] / float(img.shape[0]) * 3000) - else: - img_h_new = int(img.shape[0] * 1.5) - img_w_new = int(img.shape[1] * 1.5) - img_new = resize_image(img, img_h_new, img_w_new) - image_res = self.predict_enhancement(img_new) - # cv2.imwrite(os.path.join(self.dir_out, self.f_name) + ".tif",self.image) - # self.image=self.image.astype(np.uint16) - - # self.scale_x=1 - # self.scale_y=1 - # self.height_org = self.image.shape[0] - # self.width_org = self.image.shape[1] - is_image_enhanced = True - else: - is_image_enhanced = False - image_res = np.copy(img) - - return is_image_enhanced, img, image_res - -def resize_and_enhance_image_new(self, is_image_enhanced): - # self.check_dpi() - img = cv2.imread(self.image_dir) - img = img.astype(np.uint8) - # sys.exit() - - image_res = np.copy(img) - - return is_image_enhanced, img, image_res - -def get_image_and_scales_deskewd(self, img_deskewd): - - self.image = img_deskewd - self.image_org = np.copy(self.image) - self.height_org = self.image.shape[0] - self.width_org = self.image.shape[1] - - self.img_hight_int = int(self.image.shape[0] * 1) - self.img_width_int = int(self.image.shape[1] * 1) - self.scale_y = self.img_hight_int / float(self.image.shape[0]) - self.scale_x = self.img_width_int / float(self.image.shape[1]) - - self.image = resize_image(self.image, self.img_hight_int, self.img_width_int) - -def 
extract_drop_capital_13(self, img, patches, cols): - - img_height_h = img.shape[0] - img_width_h = img.shape[1] - patches = False - - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - model_region, session_region = self.start_new_session_and_model(self.model_region_dir_fully_np) - - img_1 = img[: int(img.shape[0] / 3.0), :, :] - img_2 = img[int(img.shape[0] / 3.0) : int(2 * img.shape[0] / 3.0), :, :] - img_3 = img[int(2 * img.shape[0] / 3.0) :, :, :] - - # img_1 = otsu_copy_binary(img_1)#otsu_copy(img) - # img_1 = img_1.astype(np.uint16) - - plt.imshow(img_1) - plt.show() - # img_2 = otsu_copy_binary(img_2)#otsu_copy(img) - # img_2 = img_2.astype(np.uint16) - - plt.imshow(img_2) - plt.show() - # img_3 = otsu_copy_binary(img_3)#otsu_copy(img) - # img_3 = img_3.astype(np.uint16) - - plt.imshow(img_3) - plt.show() - - prediction_regions_1 = self.do_prediction(patches, img_1, model_region) - - plt.imshow(prediction_regions_1) - plt.show() - - prediction_regions_2 = self.do_prediction(patches, img_2, model_region) - - plt.imshow(prediction_regions_2) - plt.show() - prediction_regions_3 = self.do_prediction(patches, img_3, model_region) - - plt.imshow(prediction_regions_3) - plt.show() - prediction_regions = np.zeros((img_height_h, img_width_h)) - - prediction_regions[: int(img.shape[0] / 3.0), :] = prediction_regions_1[:, :, 0] - prediction_regions[int(img.shape[0] / 3.0) : int(2 * img.shape[0] / 3.0), :] = prediction_regions_2[:, :, 0] - prediction_regions[int(2 * img.shape[0] / 3.0) :, :] = prediction_regions_3[:, :, 0] - - session_region.close() - del img_1 - del img_2 - del img_3 - del prediction_regions_1 - del prediction_regions_2 - del prediction_regions_3 - del model_region - del session_region - del img - gc.collect() - return prediction_regions - -def extract_only_text_regions(self, img, patches): - - model_region, session_region = self.start_new_session_and_model(self.model_only_text) - img = otsu_copy_binary(img) # otsu_copy(img) 
- img = img.astype(np.uint8) - img_org = np.copy(img) - - img_h = img_org.shape[0] - img_w = img_org.shape[1] - - img = resize_image(img_org, int(img_org.shape[0] * 1), int(img_org.shape[1] * 1)) - - prediction_regions1 = self.do_prediction(patches, img, model_region) - - prediction_regions1 = resize_image(prediction_regions1, img_h, img_w) - - # prediction_regions1 = cv2.dilate(prediction_regions1, self.kernel, iterations=4) - # prediction_regions1 = cv2.erode(prediction_regions1, self.kernel, iterations=7) - # prediction_regions1 = cv2.dilate(prediction_regions1, self.kernel, iterations=2) - - img = resize_image(img_org, int(img_org.shape[0] * 1), int(img_org.shape[1] * 1)) - - prediction_regions2 = self.do_prediction(patches, img, model_region) - - prediction_regions2 = resize_image(prediction_regions2, img_h, img_w) - - # prediction_regions2 = cv2.dilate(prediction_regions2, self.kernel, iterations=2) - prediction_regions2 = cv2.erode(prediction_regions2, self.kernel, iterations=2) - prediction_regions2 = cv2.dilate(prediction_regions2, self.kernel, iterations=2) - - # prediction_regions=( (prediction_regions2[:,:,0]==1) & (prediction_regions1[:,:,0]==1) ) - # prediction_regions=(prediction_regions1[:,:,0]==1) - - session_region.close() - del model_region - del session_region - gc.collect() - return prediction_regions1[:, :, 0] - -def extract_binarization(self, img, patches): - - model_bin, session_bin = self.start_new_session_and_model(self.model_binafrization) - - img_h = img.shape[0] - img_w = img.shape[1] - - img = resize_image(img, int(img.shape[0] * 1), int(img.shape[1] * 1)) - - prediction_regions = self.do_prediction(patches, img, model_bin) - - res = (prediction_regions[:, :, 0] != 0) * 1 - - img_fin = np.zeros((res.shape[0], res.shape[1], 3)) - res[:, :][res[:, :] == 0] = 2 - res = res - 1 - res = res * 255 - img_fin[:, :, 0] = res - img_fin[:, :, 1] = res - img_fin[:, :, 2] = res - - session_bin.close() - del model_bin - del session_bin - 
gc.collect() - # plt.imshow(img_fin[:,:,0]) - # plt.show() - return img_fin - -def get_text_region_contours_and_boxes(self, image): - rgb_class_of_texts = (1, 1, 1) - mask_texts = np.all(image == rgb_class_of_texts, axis=-1) - - image = np.repeat(mask_texts[:, :, np.newaxis], 3, axis=2) * 255 - image = image.astype(np.uint8) - - image = cv2.morphologyEx(image, cv2.MORPH_OPEN, self.kernel) - image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, self.kernel) - - imgray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - - _, thresh = cv2.threshold(imgray, 0, 255, 0) - - contours, hirarchy = cv2.findContours(thresh.copy(), cv2.cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - main_contours = filter_contours_area_of_image(thresh, contours, hirarchy, max_area=1, min_area=0.00001) - self.boxes = [] - - for jj in range(len(main_contours)): - x, y, w, h = cv2.boundingRect(main_contours[jj]) - self.boxes.append([x, y, w, h]) - - return main_contours - -def textline_contours_to_get_slope_correctly(self, textline_mask, img_patch, contour_interest): - - slope_new = 0 # deskew_images(img_patch) - - textline_mask = np.repeat(textline_mask[:, :, np.newaxis], 3, axis=2) * 255 - - textline_mask = textline_mask.astype(np.uint8) - textline_mask = cv2.morphologyEx(textline_mask, cv2.MORPH_OPEN, self.kernel) - textline_mask = cv2.morphologyEx(textline_mask, cv2.MORPH_CLOSE, self.kernel) - textline_mask = cv2.erode(textline_mask, self.kernel, iterations=1) - imgray = cv2.cvtColor(textline_mask, cv2.COLOR_BGR2GRAY) - _, thresh = cv2.threshold(imgray, 0, 255, 0) - - thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, self.kernel) - thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, self.kernel) - - contours, hirarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, max_area=1, min_area=0.003) - - textline_maskt = textline_mask[:, :, 0] - textline_maskt[textline_maskt != 0] = 1 - - peaks_point, _ = 
seperate_lines(textline_maskt, contour_interest, slope_new) - - mean_dis = np.mean(np.diff(peaks_point)) - - len_x = thresh.shape[1] - - slope_lines = [] - contours_slope_new = [] - - for kk in range(len(main_contours)): - - if len(main_contours[kk].shape) == 2: - xminh = np.min(main_contours[kk][:, 0]) - xmaxh = np.max(main_contours[kk][:, 0]) - - yminh = np.min(main_contours[kk][:, 1]) - ymaxh = np.max(main_contours[kk][:, 1]) - elif len(main_contours[kk].shape) == 3: - xminh = np.min(main_contours[kk][:, 0, 0]) - xmaxh = np.max(main_contours[kk][:, 0, 0]) - - yminh = np.min(main_contours[kk][:, 0, 1]) - ymaxh = np.max(main_contours[kk][:, 0, 1]) - - if ymaxh - yminh <= mean_dis and (xmaxh - xminh) >= 0.3 * len_x: # xminh>=0.05*len_x and xminh<=0.4*len_x and xmaxh<=0.95*len_x and xmaxh>=0.6*len_x: - contours_slope_new.append(main_contours[kk]) - - rows, cols = thresh.shape[:2] - [vx, vy, x, y] = cv2.fitLine(main_contours[kk], cv2.DIST_L2, 0, 0.01, 0.01) - - slope_lines.append((vy / vx) / np.pi * 180) - - if len(slope_lines) >= 2: - - slope = np.mean(slope_lines) # slope_true/np.pi*180 - else: - slope = 999 - - else: - slope = 0 - - return slope - - -def return_deskew_slope_new(self, img_patch, sigma_des): - max_x_y = max(img_patch.shape[0], img_patch.shape[1]) - - ##img_patch=resize_image(img_patch,max_x_y,max_x_y) - - img_patch_copy = np.zeros((img_patch.shape[0], img_patch.shape[1])) - img_patch_copy[:, :] = img_patch[:, :] # img_patch_org[:,:,0] - - img_patch_padded = np.zeros((int(max_x_y * (1.4)), int(max_x_y * (1.4)))) - - img_patch_padded_center_p = int(img_patch_padded.shape[0] / 2.0) - len_x_org_patch_half = int(img_patch_copy.shape[1] / 2.0) - len_y_org_patch_half = int(img_patch_copy.shape[0] / 2.0) - - img_patch_padded[img_patch_padded_center_p - len_y_org_patch_half : img_patch_padded_center_p - len_y_org_patch_half + img_patch_copy.shape[0], img_patch_padded_center_p - len_x_org_patch_half : img_patch_padded_center_p - len_x_org_patch_half + 
img_patch_copy.shape[1]] = img_patch_copy[:, :] - # img_patch_padded[ int( img_patch_copy.shape[0]*(.1)):int( img_patch_copy.shape[0]*(.1))+img_patch_copy.shape[0] , int( img_patch_copy.shape[1]*(.8)):int( img_patch_copy.shape[1]*(.8))+img_patch_copy.shape[1] ]=img_patch_copy[:,:] - angles = np.linspace(-25, 25, 80) - - res = [] - num_of_peaks = [] - index_cor = [] - var_res = [] - - # plt.imshow(img_patch) - # plt.show() - indexer = 0 - for rot in angles: - # print(rot,'rot') - img_rotated = rotate_image(img_patch_padded, rot) - img_rotated[img_rotated != 0] = 1 - - # plt.imshow(img_rotated) - # plt.show() - - try: - neg_peaks, var_spectrum = self.get_standard_deviation_of_summed_textline_patch_along_width(img_rotated, sigma_des, 20.3) - res_me = np.mean(neg_peaks) - if res_me == 0: - res_me = VERY_LARGE_NUMBER - else: - pass - - res_num = len(neg_peaks) - except: - res_me = VERY_LARGE_NUMBER - res_num = 0 - var_spectrum = 0 - if isNaN(res_me): - pass - else: - res.append(res_me) - var_res.append(var_spectrum) - num_of_peaks.append(res_num) - index_cor.append(indexer) - indexer = indexer + 1 - - try: - var_res = np.array(var_res) - # print(var_res) - - ang_int = angles[np.argmax(var_res)] # angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] - except: - ang_int = 0 - - if abs(ang_int) > 15: - angles = np.linspace(-90, -50, 30) - res = [] - num_of_peaks = [] - index_cor = [] - var_res = [] - - # plt.imshow(img_patch) - # plt.show() - indexer = 0 - for rot in angles: - # print(rot,'rot') - img_rotated = rotate_image(img_patch_padded, rot) - img_rotated[img_rotated != 0] = 1 - - # plt.imshow(img_rotated) - # plt.show() - - try: - neg_peaks, var_spectrum = self.get_standard_deviation_of_summed_textline_patch_along_width(img_rotated, sigma_des, 20.3) - res_me = np.mean(neg_peaks) - if res_me == 0: - res_me = VERY_LARGE_NUMBER - else: - pass - - res_num = len(neg_peaks) - except: - res_me = VERY_LARGE_NUMBER - res_num = 0 - var_spectrum 
= 0 - if isNaN(res_me): - pass - else: - res.append(res_me) - var_res.append(var_spectrum) - num_of_peaks.append(res_num) - index_cor.append(indexer) - indexer = indexer + 1 - - try: - var_res = np.array(var_res) - # print(var_res) - - ang_int = angles[np.argmax(var_res)] # angels_sorted[arg_final]#angels[arg_sort_early[arg_sort[arg_final]]]#angels[arg_fin] - except: - ang_int = 0 - - return ang_int - -def get_slopes_and_deskew(self, contours, textline_mask_tot): - - slope_biggest = 0 # return_deskew_slop(img_int_p,sigma_des, dir_of_all=self.dir_of_all, f_name=self.f_name) - - num_cores = cpu_count() - q = Queue() - poly = Queue() - box_sub = Queue() - - processes = [] - nh = np.linspace(0, len(self.boxes), num_cores + 1) - - for i in range(num_cores): - boxes_per_process = self.boxes[int(nh[i]) : int(nh[i + 1])] - contours_per_process = contours[int(nh[i]) : int(nh[i + 1])] - processes.append(Process(target=self.do_work_of_slopes, args=(q, poly, box_sub, boxes_per_process, textline_mask_tot, contours_per_process))) - - for i in range(num_cores): - processes[i].start() - - self.slopes = [] - self.all_found_texline_polygons = [] - self.boxes = [] - - for i in range(num_cores): - slopes_for_sub_process = q.get(True) - boxes_for_sub_process = box_sub.get(True) - polys_for_sub_process = poly.get(True) - - for j in range(len(slopes_for_sub_process)): - self.slopes.append(slopes_for_sub_process[j]) - self.all_found_texline_polygons.append(polys_for_sub_process[j]) - self.boxes.append(boxes_for_sub_process[j]) - - for i in range(num_cores): - processes[i].join() - - -def write_into_page_xml_only_textlines(self, contours, page_coord, all_found_texline_polygons, all_box_coord, dir_of_image): - - found_polygons_text_region = contours - - # create the file structure - data = ET.Element("PcGts") - - data.set("xmlns", "http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15") - data.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance") - 
data.set("xsi:schemaLocation", "http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15") - - metadata = ET.SubElement(data, "Metadata") - - author = ET.SubElement(metadata, "Creator") - author.text = "SBB_QURATOR" - - created = ET.SubElement(metadata, "Created") - created.text = "2019-06-17T18:15:12" - - changetime = ET.SubElement(metadata, "LastChange") - changetime.text = "2019-06-17T18:15:12" - - page = ET.SubElement(data, "Page") - - page.set("imageFilename", self.image_dir) - page.set("imageHeight", str(self.height_org)) - page.set("imageWidth", str(self.width_org)) - page.set("type", "content") - page.set("readingDirection", "left-to-right") - page.set("textLineOrder", "top-to-bottom") - - page_print_sub = ET.SubElement(page, "PrintSpace") - coord_page = ET.SubElement(page_print_sub, "Coords") - points_page_print = "" - - for lmm in range(len(self.cont_page[0])): - if len(self.cont_page[0][lmm]) == 2: - points_page_print = points_page_print + str(int((self.cont_page[0][lmm][0]) / self.scale_x)) - points_page_print = points_page_print + "," - points_page_print = points_page_print + str(int((self.cont_page[0][lmm][1]) / self.scale_y)) - else: - points_page_print = points_page_print + str(int((self.cont_page[0][lmm][0][0]) / self.scale_x)) - points_page_print = points_page_print + "," - points_page_print = points_page_print + str(int((self.cont_page[0][lmm][0][1]) / self.scale_y)) - - if lmm < (len(self.cont_page[0]) - 1): - points_page_print = points_page_print + " " - coord_page.set("points", points_page_print) - - if len(contours) > 0: - - id_indexer = 0 - id_indexer_l = 0 - - for mm in range(len(found_polygons_text_region)): - textregion = ET.SubElement(page, "TextRegion") - - textregion.set("id", "r" + str(id_indexer)) - id_indexer += 1 - - textregion.set("type", "paragraph") - # if mm==0: - # textregion.set('type','header') - # else: - # textregion.set('type','paragraph') - coord_text = ET.SubElement(textregion, "Coords") - - points_co = "" - for 
lmm in range(len(found_polygons_text_region[mm])): - if len(found_polygons_text_region[mm][lmm]) == 2: - points_co = points_co + str(int((found_polygons_text_region[mm][lmm][0] + page_coord[2]) / self.scale_x)) - points_co = points_co + "," - points_co = points_co + str(int((found_polygons_text_region[mm][lmm][1] + page_coord[0]) / self.scale_y)) - else: - points_co = points_co + str(int((found_polygons_text_region[mm][lmm][0][0] + page_coord[2]) / self.scale_x)) - points_co = points_co + "," - points_co = points_co + str(int((found_polygons_text_region[mm][lmm][0][1] + page_coord[0]) / self.scale_y)) - - if lmm < (len(found_polygons_text_region[mm]) - 1): - points_co = points_co + " " - # print(points_co) - coord_text.set("points", points_co) - - for j in range(len(all_found_texline_polygons[mm])): - - textline = ET.SubElement(textregion, "TextLine") - - textline.set("id", "l" + str(id_indexer_l)) - - id_indexer_l += 1 - - coord = ET.SubElement(textline, "Coords") - - texteq = ET.SubElement(textline, "TextEquiv") - - uni = ET.SubElement(texteq, "Unicode") - uni.text = " " - - # points = ET.SubElement(coord, 'Points') - - points_co = "" - for l in range(len(all_found_texline_polygons[mm][j])): - # point = ET.SubElement(coord, 'Point') - - # point.set('x',str(found_polygons[j][l][0])) - # point.set('y',str(found_polygons[j][l][1])) - if len(all_found_texline_polygons[mm][j][l]) == 2: - points_co = points_co + str(int((all_found_texline_polygons[mm][j][l][0] + page_coord[2]) / self.scale_x)) - points_co = points_co + "," - points_co = points_co + str(int((all_found_texline_polygons[mm][j][l][1] + page_coord[0]) / self.scale_y)) - else: - points_co = points_co + str(int((all_found_texline_polygons[mm][j][l][0][0] + page_coord[2]) / self.scale_x)) - points_co = points_co + "," - points_co = points_co + str(int((all_found_texline_polygons[mm][j][l][0][1] + page_coord[0]) / self.scale_y)) - - if l < (len(all_found_texline_polygons[mm][j]) - 1): - points_co = points_co + 
" " - # print(points_co) - coord.set("points", points_co) - - texteqreg = ET.SubElement(textregion, "TextEquiv") - - unireg = ET.SubElement(texteqreg, "Unicode") - unireg.text = " " - - # print(dir_of_image) - print(self.f_name) - # print(os.path.join(dir_of_image, self.f_name) + ".xml") - tree = ET.ElementTree(data) - tree.write(os.path.join(dir_of_image, self.f_name) + ".xml") - -def return_teilwiese_deskewed_lines(self, text_regions_p, textline_rotated): - - kernel = np.ones((5, 5), np.uint8) - textline_rotated = cv2.erode(textline_rotated, kernel, iterations=1) - - textline_rotated_new = np.zeros(textline_rotated.shape) - rgb_m = 1 - rgb_h = 2 - - cnt_m, boxes_m = return_contours_of_interested_region_and_bounding_box(text_regions_p, rgb_m) - cnt_h, boxes_h = return_contours_of_interested_region_and_bounding_box(text_regions_p, rgb_h) - - areas_cnt_m = np.array([cv2.contourArea(cnt_m[j]) for j in range(len(cnt_m))]) - - argmax = np.argmax(areas_cnt_m) - - # plt.imshow(textline_rotated[ boxes_m[argmax][1]:boxes_m[argmax][1]+boxes_m[argmax][3] ,boxes_m[argmax][0]:boxes_m[argmax][0]+boxes_m[argmax][2]]) - # plt.show() - - for argmax in range(len(boxes_m)): - - textline_text_region = textline_rotated[boxes_m[argmax][1] : boxes_m[argmax][1] + boxes_m[argmax][3], boxes_m[argmax][0] : boxes_m[argmax][0] + boxes_m[argmax][2]] - - textline_text_region_revised = self.seperate_lines_new(textline_text_region, 0) - # except: - # textline_text_region_revised=textline_rotated[ boxes_m[argmax][1]:boxes_m[argmax][1]+boxes_m[argmax][3] ,boxes_m[argmax][0]:boxes_m[argmax][0]+boxes_m[argmax][2] ] - textline_rotated_new[boxes_m[argmax][1] : boxes_m[argmax][1] + boxes_m[argmax][3], boxes_m[argmax][0] : boxes_m[argmax][0] + boxes_m[argmax][2]] = textline_text_region_revised[:, :] - - # textline_rotated_new[textline_rotated_new>0]=1 - # textline_rotated_new[textline_rotated_new<0]=0 - # plt.imshow(textline_rotated_new) - # plt.show() - -def get_regions_from_xy_neu(self, img): - img_org 
= np.copy(img) - - img_height_h = img_org.shape[0] - img_width_h = img_org.shape[1] - - model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p) - - gaussian_filter = False - patches = True - binary = True - - ratio_x = 1 - ratio_y = 1 - median_blur = False - - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), int(img_org.shape[1] * ratio_x)) - - if binary: - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = cv2.medianBlur(img, 5) - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - prediction_regions_org = self.do_prediction(patches, img, model_region) - - prediction_regions_org = resize_image(prediction_regions_org, img_height_h, img_width_h) - - # plt.imshow(prediction_regions_org[:,:,0]) - # plt.show() - # sys.exit() - prediction_regions_org = prediction_regions_org[:, :, 0] - - gaussian_filter = False - patches = False - binary = False - - ratio_x = 1 - ratio_y = 1 - median_blur = False - - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), int(img_org.shape[1] * ratio_x)) - - if binary: - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = cv2.medianBlur(img, 5) - img = cv2.medianBlur(img, 5) - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - prediction_regions_orgt = self.do_prediction(patches, img, model_region) - - prediction_regions_orgt = resize_image(prediction_regions_orgt, img_height_h, img_width_h) - - # plt.imshow(prediction_regions_orgt[:,:,0]) - # plt.show() - # sys.exit() - prediction_regions_orgt = prediction_regions_orgt[:, :, 0] - - mask_texts_longshot = (prediction_regions_orgt[:, :] == 1) * 1 - - mask_texts_longshot = np.uint8(mask_texts_longshot) - # mask_texts_longshot = cv2.dilate(mask_texts_longshot[:,:], self.kernel, iterations=2) - - pixel_img = 1 - polygons_of_only_texts_longshot = 
return_contours_of_interested_region(mask_texts_longshot, pixel_img) - - longshot_true = np.zeros(mask_texts_longshot.shape) - # text_regions_p_true[:,:]=text_regions_p_1[:,:] - - longshot_true = cv2.fillPoly(longshot_true, pts=polygons_of_only_texts_longshot, color=(1, 1, 1)) - - # plt.imshow(longshot_true) - # plt.show() - - gaussian_filter = False - patches = False - binary = False - - ratio_x = 1 - ratio_y = 1 - median_blur = False - - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), int(img_org.shape[1] * ratio_x)) - - one_third_upper_ny = int(img.shape[0] / 3.0) - - img = img[0:one_third_upper_ny, :, :] - - if binary: - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = cv2.medianBlur(img, 5) - - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - prediction_regions_longshot_one_third = self.do_prediction(patches, img, model_region) - - prediction_regions_longshot_one_third = resize_image(prediction_regions_longshot_one_third, one_third_upper_ny, img_width_h) - - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), int(img_org.shape[1] * ratio_x)) - img = img[one_third_upper_ny : int(2 * one_third_upper_ny), :, :] - - if binary: - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = cv2.medianBlur(img, 5) - - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - prediction_regions_longshot_one_third_middle = self.do_prediction(patches, img, model_region) - - prediction_regions_longshot_one_third_middle = resize_image(prediction_regions_longshot_one_third_middle, one_third_upper_ny, img_width_h) - - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), int(img_org.shape[1] * ratio_x)) - img = img[int(2 * one_third_upper_ny) :, :, :] - - if binary: - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = 
cv2.medianBlur(img, 5) - - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - prediction_regions_longshot_one_third_down = self.do_prediction(patches, img, model_region) - - prediction_regions_longshot_one_third_down = resize_image(prediction_regions_longshot_one_third_down, img_height_h - int(2 * one_third_upper_ny), img_width_h) - - # plt.imshow(prediction_regions_org[:,:,0]) - # plt.show() - # sys.exit() - prediction_regions_longshot = np.zeros((img_height_h, img_width_h)) - - # prediction_regions_longshot=prediction_regions_longshot[:,:,0] - - # prediction_regions_longshot[0:one_third_upper_ny,:]=prediction_regions_longshot_one_third[:,:,0] - # prediction_regions_longshot[one_third_upper_ny:int(2*one_third_upper_ny):,:]=prediction_regions_longshot_one_third_middle[:,:,0] - # prediction_regions_longshot[int(2*one_third_upper_ny):,:]=prediction_regions_longshot_one_third_down[:,:,0] - - prediction_regions_longshot = longshot_true[:, :] - # plt.imshow(prediction_regions_longshot) - # plt.show() - - gaussian_filter = False - patches = True - binary = False - - ratio_x = 1 # 1.1 - ratio_y = 1 - median_blur = False - - # img= resize_image(img_org, int(img_org.shape[0]*0.8), int(img_org.shape[1]*1.6)) - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), int(img_org.shape[1] * ratio_x)) - - if binary: - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = cv2.medianBlur(img, 5) - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - - prediction_regions = self.do_prediction(patches, img, model_region) - text_region1 = resize_image(prediction_regions, img_height_h, img_width_h) - - # plt.imshow(text_region1[:,:,0]) - # plt.show() - ratio_x = 1 - ratio_y = 1.2 # 1.3 - binary = False - median_blur = False - - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), int(img_org.shape[1] * ratio_x)) - - if binary: - img = 
otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = cv2.medianBlur(img, 5) - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - - prediction_regions = self.do_prediction(patches, img, model_region) - text_region2 = resize_image(prediction_regions, img_height_h, img_width_h) - - # plt.imshow(text_region2[:,:,0]) - # plt.show() - session_region.close() - del model_region - del session_region - gc.collect() - - # text_region1=text_region1[:,:,0] - # text_region2=text_region2[:,:,0] - - # text_region1[(text_region1[:,:]==2) & (text_region2[:,:]==1)]=1 - - mask_zeros_from_1 = (text_region2[:, :, 0] == 0) * 1 - # mask_text_from_1=(text_region1[:,:,0]==1)*1 - - mask_img_text_region1 = (text_region1[:, :, 0] == 2) * 1 - text_region2_1st_channel = text_region1[:, :, 0] - - text_region2_1st_channel[mask_zeros_from_1 == 1] = 0 - - ##text_region2_1st_channel[mask_img_text_region1[:,:]==1]=2 - # text_region2_1st_channel[(mask_text_from_1==1) & (text_region2_1st_channel==2)]=1 - - mask_lines1 = (text_region1[:, :, 0] == 3) * 1 - mask_lines2 = (text_region2[:, :, 0] == 3) * 1 - - mask_lines2[mask_lines1[:, :] == 1] = 1 - - # plt.imshow(text_region2_1st_channel) - # plt.show() - - text_region2_1st_channel = cv2.erode(text_region2_1st_channel[:, :], self.kernel, iterations=4) - - # plt.imshow(text_region2_1st_channel) - # plt.show() - - text_region2_1st_channel = cv2.dilate(text_region2_1st_channel[:, :], self.kernel, iterations=4) - - text_region2_1st_channel[mask_lines2[:, :] == 1] = 3 - - # text_region2_1st_channel[ (prediction_regions_org[:,:]==1) & (text_region2_1st_channel[:,:]==2)]=1 - - # only in the case of model 3 - - text_region2_1st_channel[(prediction_regions_longshot[:, :] == 1) & (text_region2_1st_channel[:, :] == 2)] = 1 - - text_region2_1st_channel[(prediction_regions_org[:, :] == 2) & (text_region2_1st_channel[:, :] == 0)] = 2 - - # 
text_region2_1st_channel[prediction_regions_org[:,:]==0]=0 - - # plt.imshow(text_region2_1st_channel) - # plt.show() - - # text_region2_1st_channel[:,:400]=0 - - mask_texts_only = (text_region2_1st_channel[:, :] == 1) * 1 - - mask_images_only = (text_region2_1st_channel[:, :] == 2) * 1 - - mask_lines_only = (text_region2_1st_channel[:, :] == 3) * 1 - - pixel_img = 1 - polygons_of_only_texts = return_contours_of_interested_region(mask_texts_only, pixel_img) - - polygons_of_only_images = return_contours_of_interested_region(mask_images_only, pixel_img) - - polygons_of_only_lines = return_contours_of_interested_region(mask_lines_only, pixel_img) - - text_regions_p_true = np.zeros(text_region2_1st_channel.shape) - # text_regions_p_true[:,:]=text_regions_p_1[:,:] - - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_lines, color=(3, 3, 3)) - - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_images, color=(2, 2, 2)) - - text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts=polygons_of_only_texts, color=(1, 1, 1)) - - ##print(np.unique(text_regions_p_true)) - - # text_regions_p_true_3d=np.repeat(text_regions_p_1[:, :, np.newaxis], 3, axis=2) - # text_regions_p_true_3d=text_regions_p_true_3d.astype(np.uint8) - - return text_regions_p_true # text_region2_1st_channel - -def get_regions_from_xy(self, img): - img_org = np.copy(img) - - img_height_h = img_org.shape[0] - img_width_h = img_org.shape[1] - - model_region, session_region = self.start_new_session_and_model(self.model_region_dir_p) - - gaussian_filter = False - patches = True - binary = True - - ratio_x = 1 - ratio_y = 1 - median_blur = False - - if binary: - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = cv2.medianBlur(img, 5) - - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - prediction_regions_org = self.do_prediction(patches, img, model_region) - - 
###plt.imshow(prediction_regions_org[:,:,0]) - ###plt.show() - ##sys.exit() - prediction_regions_org = prediction_regions_org[:, :, 0] - - gaussian_filter = False - patches = True - binary = False - - ratio_x = 1.1 - ratio_y = 1 - median_blur = False - - # img= resize_image(img_org, int(img_org.shape[0]*0.8), int(img_org.shape[1]*1.6)) - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), int(img_org.shape[1] * ratio_x)) - - if binary: - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = cv2.medianBlur(img, 5) - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - - prediction_regions = self.do_prediction(patches, img, model_region) - text_region1 = resize_image(prediction_regions, img_height_h, img_width_h) - - ratio_x = 1 - ratio_y = 1.1 - binary = False - median_blur = False - - img = resize_image(img_org, int(img_org.shape[0] * ratio_y), int(img_org.shape[1] * ratio_x)) - - if binary: - img = otsu_copy_binary(img) # otsu_copy(img) - img = img.astype(np.uint16) - - if median_blur: - img = cv2.medianBlur(img, 5) - if gaussian_filter: - img = cv2.GaussianBlur(img, (5, 5), 0) - img = img.astype(np.uint16) - - prediction_regions = self.do_prediction(patches, img, model_region) - text_region2 = resize_image(prediction_regions, img_height_h, img_width_h) - - session_region.close() - del model_region - del session_region - gc.collect() - - mask_zeros_from_1 = (text_region1[:, :, 0] == 0) * 1 - # mask_text_from_1=(text_region1[:,:,0]==1)*1 - - mask_img_text_region1 = (text_region1[:, :, 0] == 2) * 1 - text_region2_1st_channel = text_region2[:, :, 0] - - text_region2_1st_channel[mask_zeros_from_1 == 1] = 0 - - text_region2_1st_channel[mask_img_text_region1[:, :] == 1] = 2 - # text_region2_1st_channel[(mask_text_from_1==1) & (text_region2_1st_channel==2)]=1 - - mask_lines1 = (text_region1[:, :, 0] == 3) * 1 - mask_lines2 = (text_region2[:, :, 0] == 3) * 1 - - 
mask_lines2[mask_lines1[:, :] == 1] = 1 - - ##plt.imshow(text_region2_1st_channel) - ##plt.show() - - text_region2_1st_channel = cv2.erode(text_region2_1st_channel[:, :], self.kernel, iterations=5) - - ##plt.imshow(text_region2_1st_channel) - ##plt.show() - - text_region2_1st_channel = cv2.dilate(text_region2_1st_channel[:, :], self.kernel, iterations=5) - - text_region2_1st_channel[mask_lines2[:, :] == 1] = 3 - - text_region2_1st_channel[(prediction_regions_org[:, :] == 1) & (text_region2_1st_channel[:, :] == 2)] = 1 - text_region2_1st_channel[prediction_regions_org[:, :] == 3] = 3 - - ##plt.imshow(text_region2_1st_channel) - ##plt.show() - return text_region2_1st_channel - -def do_work_of_textline_seperation(self, queue_of_all_params, polygons_per_process, index_polygons_per_process, con_par_org, textline_mask_tot, mask_texts_only, num_col, scale_par, boxes_text): - - textregions_cnt_tot_per_process = [] - textlines_cnt_tot_per_process = [] - index_polygons_per_process_per_process = [] - polygons_per_par_process_per_process = [] - textline_cnt_seperated = np.zeros(textline_mask_tot.shape) - for iiii in range(len(polygons_per_process)): - # crop_img,crop_coor=crop_image_inside_box(boxes_text[mv],image_page_rotated) - # arg_max=np.argmax(areas_cnt_only_text) - textregions_cnt_tot_per_process.append(polygons_per_process[iiii] / scale_par) - textline_region_in_image = np.zeros(textline_mask_tot.shape) - cnt_o_t_max = polygons_per_process[iiii] - - x, y, w, h = cv2.boundingRect(cnt_o_t_max) - - mask_biggest = np.zeros(mask_texts_only.shape) - mask_biggest = cv2.fillPoly(mask_biggest, pts=[cnt_o_t_max], color=(1, 1, 1)) - - mask_region_in_patch_region = mask_biggest[y : y + h, x : x + w] - - textline_biggest_region = mask_biggest * textline_mask_tot - - textline_rotated_seperated = self.seperate_lines_new2(textline_biggest_region[y : y + h, x : x + w], 0, num_col) - - # new line added - ##print(np.shape(textline_rotated_seperated),np.shape(mask_biggest)) - 
textline_rotated_seperated[mask_region_in_patch_region[:, :] != 1] = 0 - # till here - - textline_cnt_seperated[y : y + h, x : x + w] = textline_rotated_seperated - textline_region_in_image[y : y + h, x : x + w] = textline_rotated_seperated - - # plt.imshow(textline_region_in_image) - # plt.show() - - # plt.imshow(textline_cnt_seperated) - # plt.show() - - pixel_img = 1 - cnt_textlines_in_image = return_contours_of_interested_textline(textline_region_in_image, pixel_img) - - textlines_cnt_per_region = [] - for jjjj in range(len(cnt_textlines_in_image)): - mask_biggest2 = np.zeros(mask_texts_only.shape) - mask_biggest2 = cv2.fillPoly(mask_biggest2, pts=[cnt_textlines_in_image[jjjj]], color=(1, 1, 1)) - if num_col + 1 == 1: - mask_biggest2 = cv2.dilate(mask_biggest2, self.kernel, iterations=5) - else: - - mask_biggest2 = cv2.dilate(mask_biggest2, self.kernel, iterations=4) - - pixel_img = 1 - cnt_textlines_in_image_ind = return_contours_of_interested_textline(mask_biggest2, pixel_img) - - try: - textlines_cnt_per_region.append(cnt_textlines_in_image_ind[0] / scale_par) - except: - pass - # print(len(cnt_textlines_in_image_ind)) - - # plt.imshow(mask_biggest2) - # plt.show() - textlines_cnt_tot_per_process.append(textlines_cnt_per_region) - index_polygons_per_process_per_process.append(index_polygons_per_process[iiii]) - polygons_per_par_process_per_process.append(con_par_org[iiii]) - - queue_of_all_params.put([index_polygons_per_process_per_process, polygons_per_par_process_per_process, textregions_cnt_tot_per_process, textlines_cnt_tot_per_process]) - - -def seperate_lines_new(img_path, thetha, num_col, dir_of_all, f_name): - - if num_col == 1: - num_patches = int(img_path.shape[1] / 200.0) - else: - num_patches = int(img_path.shape[1] / 100.0) - # num_patches=int(img_path.shape[1]/200.) 
- if num_patches == 0: - num_patches = 1 - (h, w) = img_path.shape[:2] - center = (w // 2, h // 2) - M = cv2.getRotationMatrix2D(center, -thetha, 1.0) - x_d = M[0, 2] - y_d = M[1, 2] - - thetha = thetha / 180.0 * np.pi - rotation_matrix = np.array([[np.cos(thetha), -np.sin(thetha)], [np.sin(thetha), np.cos(thetha)]]) - - x_min_cont = 0 - x_max_cont = img_path.shape[1] - y_min_cont = 0 - y_max_cont = img_path.shape[0] - - xv = np.linspace(x_min_cont, x_max_cont, 1000) - - mada_n = img_path.sum(axis=1) - - ##plt.plot(mada_n) - ##plt.show() - first_nonzero = 0 # (next((i for i, x in enumerate(mada_n) if x), None)) - - y = mada_n[:] # [first_nonzero:last_nonzero] - y_help = np.zeros(len(y) + 40) - y_help[20 : len(y) + 20] = y - x = np.array(range(len(y))) - - peaks_real, _ = find_peaks(gaussian_filter1d(y, 3), height=0) - if len(peaks_real) <= 2 and len(peaks_real) > 1: - sigma_gaus = 10 - else: - sigma_gaus = 6 - - z = gaussian_filter1d(y_help, sigma_gaus) - zneg_rev = -y_help + np.max(y_help) - zneg = np.zeros(len(zneg_rev) + 40) - zneg[20 : len(zneg_rev) + 20] = zneg_rev - zneg = gaussian_filter1d(zneg, sigma_gaus) - - peaks, _ = find_peaks(z, height=0) - peaks_neg, _ = find_peaks(zneg, height=0) - - for nn in range(len(peaks_neg)): - if peaks_neg[nn] > len(z) - 1: - peaks_neg[nn] = len(z) - 1 - if peaks_neg[nn] < 0: - peaks_neg[nn] = 0 - - diff_peaks = np.abs(np.diff(peaks_neg)) - cut_off = 20 - peaks_neg_true = [] - forest = [] - - for i in range(len(peaks_neg)): - if i == 0: - forest.append(peaks_neg[i]) - if i < (len(peaks_neg) - 1): - if diff_peaks[i] <= cut_off: - forest.append(peaks_neg[i + 1]) - if diff_peaks[i] > cut_off: - # print(forest[np.argmin(z[forest]) ] ) - if not isNaN(forest[np.argmin(z[forest])]): - # print(len(z),forest) - peaks_neg_true.append(forest[np.argmin(z[forest])]) - forest = [] - forest.append(peaks_neg[i + 1]) - if i == (len(peaks_neg) - 1): - # print(print(forest[np.argmin(z[forest]) ] )) - if not isNaN(forest[np.argmin(z[forest])]): 
- - peaks_neg_true.append(forest[np.argmin(z[forest])]) - - peaks_neg_true = np.array(peaks_neg_true) - - """ - #plt.figure(figsize=(40,40)) - #plt.subplot(1,2,1) - #plt.title('Textline segmentation von Textregion') - #plt.imshow(img_path) - #plt.xlabel('X') - #plt.ylabel('Y') - #plt.subplot(1,2,2) - #plt.title('Dichte entlang X') - #base = pyplot.gca().transData - #rot = transforms.Affine2D().rotate_deg(90) - #plt.plot(zneg,np.array(range(len(zneg)))) - #plt.plot(zneg[peaks_neg_true],peaks_neg_true,'*') - #plt.gca().invert_yaxis() - - #plt.xlabel('Dichte') - #plt.ylabel('Y') - ##plt.plot([0,len(y)], [grenze,grenze]) - #plt.show() - """ - - peaks_neg_true = peaks_neg_true - 20 - 20 - peaks = peaks - 20 - - # dis_up=peaks_neg_true[14]-peaks_neg_true[0] - # dis_down=peaks_neg_true[18]-peaks_neg_true[14] - - img_patch_ineterst = img_path[:, :] # [peaks_neg_true[14]-dis_up:peaks_neg_true[15]+dis_down ,:] - - ##plt.imshow(img_patch_ineterst) - ##plt.show() - - length_x = int(img_path.shape[1] / float(num_patches)) - margin = int(0.04 * length_x) - - width_mid = length_x - 2 * margin - - nxf = img_path.shape[1] / float(width_mid) - - if nxf > int(nxf): - nxf = int(nxf) + 1 - else: - nxf = int(nxf) - - slopes_tile_wise = [] - for i in range(nxf): - if i == 0: - index_x_d = i * width_mid - index_x_u = index_x_d + length_x - elif i > 0: - index_x_d = i * width_mid - index_x_u = index_x_d + length_x - - if index_x_u > img_path.shape[1]: - index_x_u = img_path.shape[1] - index_x_d = img_path.shape[1] - length_x - - # img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :] - img_xline = img_patch_ineterst[:, index_x_d:index_x_u] - - sigma = 2 - try: - slope_xline = return_deskew_slop(img_xline, sigma, dir_of_all=dir_of_all, f_name=f_name) - except: - slope_xline = 0 - slopes_tile_wise.append(slope_xline) - # print(slope_xline,'xlineeee') - img_line_rotated = rotate_image(img_xline, slope_xline) - img_line_rotated[:, :][img_line_rotated[:, :] != 0] = 1 - - """ - - 
xline=np.linspace(0,img_path.shape[1],nx) - slopes_tile_wise=[] - - for ui in range( nx-1 ): - img_xline=img_patch_ineterst[:,int(xline[ui]):int(xline[ui+1])] - - - ##plt.imshow(img_xline) - ##plt.show() - - sigma=3 - try: - slope_xline=return_deskew_slop(img_xline,sigma, dir_of_all=self.dir_of_all, f_name=self.f_name) - except: - slope_xline=0 - slopes_tile_wise.append(slope_xline) - print(slope_xline,'xlineeee') - img_line_rotated=rotate_image(img_xline,slope_xline) - - ##plt.imshow(img_line_rotated) - ##plt.show() - """ - - # dis_up=peaks_neg_true[14]-peaks_neg_true[0] - # dis_down=peaks_neg_true[18]-peaks_neg_true[14] - - img_patch_ineterst = img_path[:, :] # [peaks_neg_true[14]-dis_up:peaks_neg_true[14]+dis_down ,:] - - img_patch_ineterst_revised = np.zeros(img_patch_ineterst.shape) - - for i in range(nxf): - if i == 0: - index_x_d = i * width_mid - index_x_u = index_x_d + length_x - elif i > 0: - index_x_d = i * width_mid - index_x_u = index_x_d + length_x - - if index_x_u > img_path.shape[1]: - index_x_u = img_path.shape[1] - index_x_d = img_path.shape[1] - length_x - - img_xline = img_patch_ineterst[:, index_x_d:index_x_u] - - img_int = np.zeros((img_xline.shape[0], img_xline.shape[1])) - img_int[:, :] = img_xline[:, :] # img_patch_org[:,:,0] - - img_resized = np.zeros((int(img_int.shape[0] * (1.2)), int(img_int.shape[1] * (3)))) - - img_resized[int(img_int.shape[0] * (0.1)) : int(img_int.shape[0] * (0.1)) + img_int.shape[0], int(img_int.shape[1] * (1)) : int(img_int.shape[1] * (1)) + img_int.shape[1]] = img_int[:, :] - ##plt.imshow(img_xline) - ##plt.show() - img_line_rotated = rotate_image(img_resized, slopes_tile_wise[i]) - img_line_rotated[:, :][img_line_rotated[:, :] != 0] = 1 - - img_patch_seperated = seperate_lines_new_inside_teils(img_line_rotated, 0) - - ##plt.imshow(img_patch_seperated) - ##plt.show() - img_patch_seperated_returned = rotate_image(img_patch_seperated, -slopes_tile_wise[i]) - img_patch_seperated_returned[:, 
:][img_patch_seperated_returned[:, :] != 0] = 1 - - img_patch_seperated_returned_true_size = img_patch_seperated_returned[int(img_int.shape[0] * (0.1)) : int(img_int.shape[0] * (0.1)) + img_int.shape[0], int(img_int.shape[1] * (1)) : int(img_int.shape[1] * (1)) + img_int.shape[1]] - - img_patch_seperated_returned_true_size = img_patch_seperated_returned_true_size[:, margin : length_x - margin] - img_patch_ineterst_revised[:, index_x_d + margin : index_x_u - margin] = img_patch_seperated_returned_true_size - - """ - for ui in range( nx-1 ): - img_xline=img_patch_ineterst[:,int(xline[ui]):int(xline[ui+1])] - - - img_int=np.zeros((img_xline.shape[0],img_xline.shape[1])) - img_int[:,:]=img_xline[:,:]#img_patch_org[:,:,0] - - img_resized=np.zeros((int( img_int.shape[0]*(1.2) ) , int( img_int.shape[1]*(3) ) )) - - img_resized[ int( img_int.shape[0]*(.1)):int( img_int.shape[0]*(.1))+img_int.shape[0] , int( img_int.shape[1]*(1)):int( img_int.shape[1]*(1))+img_int.shape[1] ]=img_int[:,:] - ##plt.imshow(img_xline) - ##plt.show() - img_line_rotated=rotate_image(img_resized,slopes_tile_wise[ui]) - - - #img_patch_seperated = seperate_lines_new_inside_teils(img_line_rotated,0) - - img_patch_seperated = seperate_lines_new_inside_teils(img_line_rotated,0) - - img_patch_seperated_returned=rotate_image(img_patch_seperated,-slopes_tile_wise[ui]) - ##plt.imshow(img_patch_seperated) - ##plt.show() - print(img_patch_seperated_returned.shape) - #plt.imshow(img_patch_seperated_returned[ int( img_int.shape[0]*(.1)):int( img_int.shape[0]*(.1))+img_int.shape[0] , int( img_int.shape[1]*(1)):int( img_int.shape[1]*(1))+img_int.shape[1] ]) - #plt.show() - - img_patch_ineterst_revised[:,int(xline[ui]):int(xline[ui+1])]=img_patch_seperated_returned[ int( img_int.shape[0]*(.1)):int( img_int.shape[0]*(.1))+img_int.shape[0] , int( img_int.shape[1]*(1)):int( img_int.shape[1]*(1))+img_int.shape[1] ] - - - """ - - # print(img_patch_ineterst_revised.shape,np.unique(img_patch_ineterst_revised)) - 
##plt.imshow(img_patch_ineterst_revised) - ##plt.show() - return img_patch_ineterst_revised - -def return_contours_of_interested_region_and_bounding_box(region_pre_p, pixel): - - # pixels of images are identified by 5 - cnts_images = (region_pre_p[:, :, 0] == pixel) * 1 - cnts_images = cnts_images.astype(np.uint8) - cnts_images = np.repeat(cnts_images[:, :, np.newaxis], 3, axis=2) - imgray = cv2.cvtColor(cnts_images, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - contours_imgs, hiearchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - - contours_imgs = return_parent_contours(contours_imgs, hiearchy) - contours_imgs = filter_contours_area_of_image_tables(thresh, contours_imgs, hiearchy, max_area=1, min_area=0.0003) - - boxes = [] - - for jj in range(len(contours_imgs)): - x, y, w, h = cv2.boundingRect(contours_imgs[jj]) - boxes.append([int(x), int(y), int(w), int(h)]) - return contours_imgs, boxes - -def return_bonding_box_of_contours(cnts): - boxes_tot = [] - for i in range(len(cnts)): - x, y, w, h = cv2.boundingRect(cnts[i]) - - box = [x, y, w, h] - boxes_tot.append(box) - return boxes_tot - -def find_features_of_contours(contours_main): - - areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - - y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - - return y_min_main, y_max_main, areas_main - -def 
filter_contours_area_of_image_interiors(image, contours, hirarchy, max_area, min_area): - found_polygons_early = list() - - jv = 0 - for c in contours: - if len(c) < 3: # A polygon cannot have less than 3 points - continue - - polygon = geometry.Polygon([point[0] for point in c]) - area = polygon.area - if area >= min_area * np.prod(image.shape[:2]) and area <= max_area * np.prod(image.shape[:2]) and hirarchy[0][jv][3] != -1: - # print(c[0][0][1]) - found_polygons_early.append(np.array([point for point in polygon.exterior.coords], dtype=np.uint)) - jv += 1 - return found_polygons_early - -def return_hor_spliter_by_index_for_without_verticals(peaks_neg_fin_t, x_min_hor_some, x_max_hor_some): - # print(peaks_neg_fin_t,x_min_hor_some,x_max_hor_some) - arg_min_hor_sort = np.argsort(x_min_hor_some) - x_min_hor_some_sort = np.sort(x_min_hor_some) - x_max_hor_some_sort = x_max_hor_some[arg_min_hor_sort] - - arg_minmax = np.array(range(len(peaks_neg_fin_t))) - indexer_lines = [] - indexes_to_delete = [] - indexer_lines_deletions_len = [] - indexr_uniq_ind = [] - for i in range(len(x_min_hor_some_sort)): - min_h = peaks_neg_fin_t - x_min_hor_some_sort[i] - - max_h = peaks_neg_fin_t - x_max_hor_some_sort[i] - - min_h[0] = min_h[0] # +20 - max_h[len(max_h) - 1] = max_h[len(max_h) - 1] - 20 - - min_h_neg = arg_minmax[(min_h < 0)] - min_h_neg_n = min_h[min_h < 0] - - try: - min_h_neg = [min_h_neg[np.argmax(min_h_neg_n)]] - except: - min_h_neg = [] - - max_h_neg = arg_minmax[(max_h > 0)] - max_h_neg_n = max_h[max_h > 0] - - if len(max_h_neg_n) > 0: - max_h_neg = [max_h_neg[np.argmin(max_h_neg_n)]] - else: - max_h_neg = [] - - if len(min_h_neg) > 0 and len(max_h_neg) > 0: - deletions = list(range(min_h_neg[0] + 1, max_h_neg[0])) - unique_delets_int = [] - # print(deletions,len(deletions),'delii') - if len(deletions) > 0: - - for j in range(len(deletions)): - indexes_to_delete.append(deletions[j]) - # print(deletions,indexes_to_delete,'badiii') - unique_delets = 
np.unique(indexes_to_delete) - # print(min_h_neg[0],unique_delets) - unique_delets_int = unique_delets[unique_delets < min_h_neg[0]] - - indexer_lines_deletions_len.append(len(deletions)) - indexr_uniq_ind.append([deletions]) - - else: - indexer_lines_deletions_len.append(0) - indexr_uniq_ind.append(-999) - - index_line_true = min_h_neg[0] - len(unique_delets_int) - # print(index_line_true) - if index_line_true > 0 and min_h_neg[0] >= 2: - index_line_true = index_line_true - else: - index_line_true = min_h_neg[0] - - indexer_lines.append(index_line_true) - - if len(unique_delets_int) > 0: - for dd in range(len(unique_delets_int)): - indexes_to_delete.append(unique_delets_int[dd]) - else: - indexer_lines.append(-999) - indexer_lines_deletions_len.append(-999) - indexr_uniq_ind.append(-999) - - peaks_true = [] - for m in range(len(peaks_neg_fin_t)): - if m in indexes_to_delete: - pass - else: - peaks_true.append(peaks_neg_fin_t[m]) - return indexer_lines, peaks_true, arg_min_hor_sort, indexer_lines_deletions_len, indexr_uniq_ind - -def otsu_copy(img): - img_r = np.zeros(img.shape) - img1 = img[:, :, 0] - img2 = img[:, :, 1] - img3 = img[:, :, 2] - # print(img.min()) - # print(img[:,:,0].min()) - # blur = cv2.GaussianBlur(img,(5,5)) - # ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - retval1, threshold1 = cv2.threshold(img1, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - retval2, threshold2 = cv2.threshold(img2, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - retval3, threshold3 = cv2.threshold(img3, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - - img_r[:, :, 0] = threshold1 - img_r[:, :, 1] = threshold1 - img_r[:, :, 2] = threshold1 - return img_r - -def return_hor_spliter_by_index(peaks_neg_fin_t, x_min_hor_some, x_max_hor_some): - - arg_min_hor_sort = np.argsort(x_min_hor_some) - x_min_hor_some_sort = np.sort(x_min_hor_some) - x_max_hor_some_sort = x_max_hor_some[arg_min_hor_sort] - - arg_minmax = np.array(range(len(peaks_neg_fin_t))) - 
indexer_lines = [] - indexes_to_delete = [] - indexer_lines_deletions_len = [] - indexr_uniq_ind = [] - for i in range(len(x_min_hor_some_sort)): - min_h = peaks_neg_fin_t - x_min_hor_some_sort[i] - max_h = peaks_neg_fin_t - x_max_hor_some_sort[i] - - min_h[0] = min_h[0] # +20 - max_h[len(max_h) - 1] = max_h[len(max_h) - 1] ##-20 - - min_h_neg = arg_minmax[(min_h < 0) & (np.abs(min_h) < 360)] - max_h_neg = arg_minmax[(max_h >= 0) & (np.abs(max_h) < 360)] - - if len(min_h_neg) > 0 and len(max_h_neg) > 0: - deletions = list(range(min_h_neg[0] + 1, max_h_neg[0])) - unique_delets_int = [] - # print(deletions,len(deletions),'delii') - if len(deletions) > 0: - # print(deletions,len(deletions),'delii2') - - for j in range(len(deletions)): - indexes_to_delete.append(deletions[j]) - # print(deletions,indexes_to_delete,'badiii') - unique_delets = np.unique(indexes_to_delete) - # print(min_h_neg[0],unique_delets) - unique_delets_int = unique_delets[unique_delets < min_h_neg[0]] - - indexer_lines_deletions_len.append(len(deletions)) - indexr_uniq_ind.append([deletions]) - - else: - indexer_lines_deletions_len.append(0) - indexr_uniq_ind.append(-999) - - index_line_true = min_h_neg[0] - len(unique_delets_int) - # print(index_line_true) - if index_line_true > 0 and min_h_neg[0] >= 2: - index_line_true = index_line_true - else: - index_line_true = min_h_neg[0] - - indexer_lines.append(index_line_true) - - if len(unique_delets_int) > 0: - for dd in range(len(unique_delets_int)): - indexes_to_delete.append(unique_delets_int[dd]) - else: - indexer_lines.append(-999) - indexer_lines_deletions_len.append(-999) - indexr_uniq_ind.append(-999) - - peaks_true = [] - for m in range(len(peaks_neg_fin_t)): - if m in indexes_to_delete: - pass - else: - peaks_true.append(peaks_neg_fin_t[m]) - return indexer_lines, peaks_true, arg_min_hor_sort, indexer_lines_deletions_len, indexr_uniq_ind - -def implent_law_head_main_not_parallel(text_regions): - # print(text_regions.shape) - text_indexes = [1, 
2] # 1: main text , 2: header , 3: comments - - for t_i in text_indexes: - textline_mask = text_regions[:, :] == t_i - textline_mask = textline_mask * 255.0 - - textline_mask = textline_mask.astype(np.uint8) - textline_mask = np.repeat(textline_mask[:, :, np.newaxis], 3, axis=2) - kernel = np.ones((5, 5), np.uint8) - - # print(type(textline_mask),np.unique(textline_mask),textline_mask.shape) - imgray = cv2.cvtColor(textline_mask, cv2.COLOR_BGR2GRAY) - ret, thresh = cv2.threshold(imgray, 0, 255, 0) - - if t_i == 1: - contours_main, hirarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - # print(type(contours_main)) - areas_main = np.array([cv2.contourArea(contours_main[j]) for j in range(len(contours_main))]) - M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] - cx_main = [(M_main[j]["m10"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - cy_main = [(M_main[j]["m01"] / (M_main[j]["m00"] + 1e-32)) for j in range(len(M_main))] - x_min_main = np.array([np.min(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - x_max_main = np.array([np.max(contours_main[j][:, 0, 0]) for j in range(len(contours_main))]) - - y_min_main = np.array([np.min(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - y_max_main = np.array([np.max(contours_main[j][:, 0, 1]) for j in range(len(contours_main))]) - # print(contours_main[0],np.shape(contours_main[0]),contours_main[0][:,0,0]) - elif t_i == 2: - contours_header, hirarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - # print(type(contours_header)) - areas_header = np.array([cv2.contourArea(contours_header[j]) for j in range(len(contours_header))]) - M_header = [cv2.moments(contours_header[j]) for j in range(len(contours_header))] - cx_header = [(M_header[j]["m10"] / (M_header[j]["m00"] + 1e-32)) for j in range(len(M_header))] - cy_header = [(M_header[j]["m01"] / (M_header[j]["m00"] + 1e-32)) for j in 
range(len(M_header))] - - x_min_header = np.array([np.min(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - x_max_header = np.array([np.max(contours_header[j][:, 0, 0]) for j in range(len(contours_header))]) - - y_min_header = np.array([np.min(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - y_max_header = np.array([np.max(contours_header[j][:, 0, 1]) for j in range(len(contours_header))]) - - args = np.array(range(1, len(cy_header) + 1)) - args_main = np.array(range(1, len(cy_main) + 1)) - for jj in range(len(contours_main)): - headers_in_main = [(cy_header > y_min_main[jj]) & ((cy_header < y_max_main[jj]))] - mains_in_main = [(cy_main > y_min_main[jj]) & ((cy_main < y_max_main[jj]))] - args_log = args * headers_in_main - res = args_log[args_log > 0] - res_true = res - 1 - - args_log_main = args_main * mains_in_main - res_main = args_log_main[args_log_main > 0] - res_true_main = res_main - 1 - - if len(res_true) > 0: - sum_header = np.sum(areas_header[res_true]) - sum_main = np.sum(areas_main[res_true_main]) - if sum_main > sum_header: - cnt_int = [contours_header[j] for j in res_true] - text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(1, 1, 1)) - else: - cnt_int = [contours_main[j] for j in res_true_main] - text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(2, 2, 2)) - - for jj in range(len(contours_header)): - main_in_header = [(cy_main > y_min_header[jj]) & ((cy_main < y_max_header[jj]))] - header_in_header = [(cy_header > y_min_header[jj]) & ((cy_header < y_max_header[jj]))] - args_log = args_main * main_in_header - res = args_log[args_log > 0] - res_true = res - 1 - - args_log_header = args * header_in_header - res_header = args_log_header[args_log_header > 0] - res_true_header = res_header - 1 - - if len(res_true) > 0: - - sum_header = np.sum(areas_header[res_true_header]) - sum_main = np.sum(areas_main[res_true]) - - if sum_main > sum_header: - - cnt_int = [contours_header[j] for j in 
res_true_header] - text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(1, 1, 1)) - else: - cnt_int = [contours_main[j] for j in res_true] - text_regions = cv2.fillPoly(text_regions, pts=cnt_int, color=(2, 2, 2)) - - return text_regions - -def delete_seperator_around(spliter_y, peaks_neg, image_by_region): - # format of subboxes box=[x1, x2 , y1, y2] - - if len(image_by_region.shape) == 3: - for i in range(len(spliter_y) - 1): - for j in range(1, len(peaks_neg[i]) - 1): - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0] == 6] = 0 - image_by_region[spliter_y[i] : spliter_y[i + 1], peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 1] == 6] = 0 - image_by_region[spliter_y[i] : spliter_y[i + 1], peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 2] == 6] = 0 - - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0] == 7] = 0 - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + 
int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 1] == 7] = 0 - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 0][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j]), 2] == 7] = 0 - else: - for i in range(len(spliter_y) - 1): - for j in range(1, len(peaks_neg[i]) - 1): - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])] == 6] = 0 - - image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])][image_by_region[int(spliter_y[i]) : int(spliter_y[i + 1]), peaks_neg[i][j] - int(1.0 / 20.0 * peaks_neg[i][j]) : peaks_neg[i][j] + int(1.0 / 20.0 * peaks_neg[i][j])] == 7] = 0 - return image_by_region - From c25fcd81378be073db390ab7c41db83b22742d52 Mon Sep 17 00:00:00 2001 From: vahid Date: Sat, 27 Feb 2021 09:01:31 -0500 Subject: [PATCH 86/89] check pc --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 9fd7bd8..242649a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +#check first update from repaired pc # ocrd includes opencv, numpy, shapely, click ocrd >= 2.20.3 keras >= 2.3.1, < 2.4 From 3bda2f128e8851496e93970c5087bbf0afcb8b7a Mon Sep 17 00:00:00 2001 From: vahid Date: Sat, 27 Feb 2021 09:59:10 -0500 Subject: [PATCH 87/89] all 
options checked, failures are fixed and all tests passed --- requirements.txt | 1 - sbb_newspapers_org_image/eynollah.py | 38 ++-- sbb_newspapers_org_image/plot.py | 204 ++++++++++++---------- sbb_newspapers_org_image/utils/contour.py | 1 - sbb_newspapers_org_image/utils/xml.py | 1 + sbb_newspapers_org_image/writer.py | 31 ++-- 6 files changed, 145 insertions(+), 131 deletions(-) diff --git a/requirements.txt b/requirements.txt index 242649a..9fd7bd8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -#check first update from repaired pc # ocrd includes opencv, numpy, shapely, click ocrd >= 2.20.3 keras >= 2.3.1, < 2.4 diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 21747b4..4bd0c56 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -28,6 +28,7 @@ import tensorflow as tf tf.get_logger().setLevel("ERROR") warnings.filterwarnings("ignore") + from .utils.contour import ( filter_contours_area_of_image, find_contours_mean_y_diff, @@ -110,7 +111,7 @@ class Eynollah: dir_of_cropped_images=dir_of_cropped_images, dir_of_layout=dir_of_layout, image_filename=image_filename, - image_filename_stem=image_filename_stem) + image_filename_stem=self.image_filename_stem) self.writer = EynollahXmlWriter( dir_out=self.dir_out, image_filename=self.image_filename, @@ -336,7 +337,10 @@ class Eynollah: def resize_and_enhance_image_with_column_classifier(self): self.logger.debug("enter resize_and_enhance_image_with_column_classifier") - dpi = check_dpi(self.image_filename) + try: + dpi = check_dpi(self.image_filename) + except: + dpi = 230 self.logger.info("Detected %s DPI", dpi) img = self.imread() @@ -705,13 +709,12 @@ class Eynollah: def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew): self.logger.debug("enter get_slopes_and_deskew_new") - num_cores = cpu_count() + num_cores = 1#cpu_count() queue_of_all_params = 
Queue() processes = [] nh = np.linspace(0, len(boxes), num_cores + 1) indexes_by_text_con = np.array(range(len(contours_par))) - for i in range(num_cores): boxes_per_process = boxes[int(nh[i]) : int(nh[i + 1])] contours_per_process = contours[int(nh[i]) : int(nh[i + 1])] @@ -719,7 +722,6 @@ class Eynollah: indexes_text_con_per_process = indexes_by_text_con[int(nh[i]) : int(nh[i + 1])] processes.append(Process(target=self.do_work_of_slopes_new, args=(queue_of_all_params, boxes_per_process, textline_mask_tot, contours_per_process, contours_par_per_process, indexes_text_con_per_process, image_page_rotated, slope_deskew))) - for i in range(num_cores): processes[i].start() @@ -730,7 +732,6 @@ class Eynollah: boxes = [] all_box_coord = [] all_index_text_con = [] - for i in range(num_cores): list_all_par = queue_of_all_params.get(True) slopes_for_sub_process = list_all_par[0] @@ -748,7 +749,6 @@ class Eynollah: all_found_text_regions_par.append(contours_par_for_subprocess[j]) all_box_coord.append(boxes_coord_for_subprocess[j]) all_index_text_con.append(indexes_for_subprocess[j]) - for i in range(num_cores): processes[i].join() self.logger.debug('slopes %s', slopes) @@ -918,7 +918,6 @@ class Eynollah: def do_work_of_slopes_new(self, queue_of_all_params, boxes_text, textline_mask_tot_ea, contours_per_process, contours_par_per_process, indexes_r_con_per_pro, image_page_rotated, slope_deskew): self.logger.debug('enter do_work_of_slopes_new') - slopes_per_each_subprocess = [] bounding_box_of_textregion_per_each_subprocess = [] textlines_rectangles_per_each_subprocess = [] @@ -926,7 +925,6 @@ class Eynollah: contours_textregion_par_per_each_subprocess = [] all_box_coord_per_process = [] index_by_text_region_contours = [] - for mv in range(len(boxes_text)): _, crop_coor = crop_image_inside_box(boxes_text[mv],image_page_rotated) mask_textline = np.zeros((textline_mask_tot_ea.shape)) @@ -959,7 +957,6 @@ class Eynollah: except Exception as why: self.logger.error(why) slope_for_all 
= MAX_SLOPE - if slope_for_all == MAX_SLOPE: slope_for_all = [slope_deskew][0] slopes_per_each_subprocess.append(slope_for_all) @@ -988,7 +985,6 @@ class Eynollah: contours_textregion_per_each_subprocess.append(contours_per_process[mv]) contours_textregion_par_per_each_subprocess.append(contours_par_per_process[mv]) all_box_coord_per_process.append(crop_coor) - queue_of_all_params.put([slopes_per_each_subprocess, textlines_rectangles_per_each_subprocess, bounding_box_of_textregion_per_each_subprocess, contours_textregion_per_each_subprocess, contours_textregion_par_per_each_subprocess, all_box_coord_per_process, index_by_text_region_contours]) def textline_contours(self, img, patches, scaler_h, scaler_w): @@ -1596,6 +1592,9 @@ class Eynollah: t0 = time.time() img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement() + + + self.logger.info("Enhancing took %ss ", str(time.time() - t0)) t1 = time.time() @@ -1633,11 +1632,9 @@ class Eynollah: pixel_img = 4 min_area_mar = 0.00001 polygons_of_marginals = return_contours_of_interested_region(text_regions_p, pixel_img, min_area_mar) - + if self.full_layout: polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, regions_fully, regions_without_seperators = self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, img_only_regions) - # plt.imshow(img_revised_tab) - # plt.show() text_only = ((img_revised_tab[:, :] == 1)) * 1 if np.abs(slope_deskew) >= SLOPE_THRESHOLD: @@ -1723,25 +1720,22 @@ class Eynollah: self.logger.debug('areas_cnt_text_parent %s', areas_cnt_text_parent) # self.logger.debug('areas_cnt_text_parent_d %s', areas_cnt_text_parent_d) # self.logger.debug('len(contours_only_text_parent) %s', len(contours_only_text_parent_d)) - txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first) boxes_text, _ = 
get_text_region_boxes_by_given_contours(contours_only_text_parent) boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals) if not self.curved_line: slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) - _, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) + slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) else: + scale_param = 1 all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) - all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, _ = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = 
self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) - K.clear_session() - # print(index_by_text_par_con,'index_by_text_par_con') - if self.full_layout: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) @@ -1809,7 +1803,7 @@ class Eynollah: else: order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page) + pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page) self.logger.info("Job done in %ss", str(time.time() - t0)) return pcgts else: @@ -1819,6 +1813,6 @@ class Eynollah: else: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) order_text_new, id_of_texts_tot = 
self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page) + pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page) self.logger.info("Job done in %ss", str(time.time() - t0)) return pcgts diff --git a/sbb_newspapers_org_image/plot.py b/sbb_newspapers_org_image/plot.py index cba8b58..4dee928 100644 --- a/sbb_newspapers_org_image/plot.py +++ b/sbb_newspapers_org_image/plot.py @@ -39,121 +39,131 @@ class EynollahPlotter(): self.scale_y = scale_y def save_plot_of_layout_main(self, text_regions_p, image_page): - values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels=['Background' , 'Main text' , 'Image' , 'Separator','Marginalia'] - values_indexes = [0, 1, 2, 3, 4] - plt.figure(figsize=(40, 40)) - plt.rcParams["font.size"] = "40" - im = plt.imshow(text_regions_p[:, :]) - colors = [im.cmap(im.norm(value)) for value in values] - patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] - plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=40) - plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout_main.png")) + if self.dir_of_layout is not None: + values = np.unique(text_regions_p[:, :]) + # pixels=['Background' , 'Main 
text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] + pixels=['Background' , 'Main text' , 'Image' , 'Separator','Marginalia'] + values_indexes = [0, 1, 2, 3, 4] + plt.figure(figsize=(40, 40)) + plt.rcParams["font.size"] = "40" + im = plt.imshow(text_regions_p[:, :]) + colors = [im.cmap(im.norm(value)) for value in values] + patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] + plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=40) + plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout_main.png")) + def save_plot_of_layout_main_all(self, text_regions_p, image_page): - values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels=['Background' , 'Main text' , 'Image' , 'Separator','Marginalia'] - values_indexes = [0, 1, 2, 3, 4] - plt.figure(figsize=(80, 40)) - plt.rcParams["font.size"] = "40" - plt.subplot(1, 2, 1) - plt.imshow(image_page) - plt.subplot(1, 2, 2) - im = plt.imshow(text_regions_p[:, :]) - colors = [im.cmap(im.norm(value)) for value in values] - patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] - plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_main_and_page.png")) + if self.dir_of_all is not None: + values = np.unique(text_regions_p[:, :]) + # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] + pixels=['Background' , 'Main text' , 'Image' , 'Separator','Marginalia'] + values_indexes = [0, 1, 2, 3, 4] + 
plt.figure(figsize=(80, 40)) + plt.rcParams["font.size"] = "40" + plt.subplot(1, 2, 1) + plt.imshow(image_page) + plt.subplot(1, 2, 2) + im = plt.imshow(text_regions_p[:, :]) + colors = [im.cmap(im.norm(value)) for value in values] + patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] + plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_main_and_page.png")) def save_plot_of_layout(self, text_regions_p, image_page): - values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels = ["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"] - values_indexes = [0, 1, 2, 8, 4, 5, 6] - plt.figure(figsize=(40, 40)) - plt.rcParams["font.size"] = "40" - im = plt.imshow(text_regions_p[:, :]) - colors = [im.cmap(im.norm(value)) for value in values] - patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] - plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=40) - plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout.png")) + if self.dir_of_layout is not None: + values = np.unique(text_regions_p[:, :]) + # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] + pixels = ["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"] + values_indexes = [0, 1, 2, 8, 4, 5, 6] + plt.figure(figsize=(40, 40)) + plt.rcParams["font.size"] = "40" + im = plt.imshow(text_regions_p[:, :]) + colors = [im.cmap(im.norm(value)) for value in values] + 
patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] + plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=40) + plt.savefig(os.path.join(self.dir_of_layout, self.image_filename_stem + "_layout.png")) def save_plot_of_layout_all(self, text_regions_p, image_page): - values = np.unique(text_regions_p[:, :]) - # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] - pixels = ["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"] - values_indexes = [0, 1, 2, 8, 4, 5, 6] - plt.figure(figsize=(80, 40)) - plt.rcParams["font.size"] = "40" - plt.subplot(1, 2, 1) - plt.imshow(image_page) - plt.subplot(1, 2, 2) - im = plt.imshow(text_regions_p[:, :]) - colors = [im.cmap(im.norm(value)) for value in values] - patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] - plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_and_page.png")) + if self.dir_of_all is not None: + values = np.unique(text_regions_p[:, :]) + # pixels=['Background' , 'Main text' , 'Heading' , 'Marginalia' ,'Drop capitals' , 'Images' , 'Seperators' , 'Tables', 'Graphics'] + pixels = ["Background", "Main text", "Header", "Marginalia", "Drop capital", "Image", "Separator"] + values_indexes = [0, 1, 2, 8, 4, 5, 6] + plt.figure(figsize=(80, 40)) + plt.rcParams["font.size"] = "40" + plt.subplot(1, 2, 1) + plt.imshow(image_page) + plt.subplot(1, 2, 2) + im = plt.imshow(text_regions_p[:, :]) + colors = [im.cmap(im.norm(value)) for value in values] + patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], 
label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] + plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_layout_and_page.png")) def save_plot_of_textlines(self, textline_mask_tot_ea, image_page): - values = np.unique(textline_mask_tot_ea[:, :]) - pixels = ["Background", "Textlines"] - values_indexes = [0, 1] - plt.figure(figsize=(80, 40)) - plt.rcParams["font.size"] = "40" - plt.subplot(1, 2, 1) - plt.imshow(image_page) - plt.subplot(1, 2, 2) - im = plt.imshow(textline_mask_tot_ea[:, :]) - colors = [im.cmap(im.norm(value)) for value in values] - patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] - plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) - plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_textline_and_page.png")) + if self.dir_of_all is not None: + values = np.unique(textline_mask_tot_ea[:, :]) + pixels = ["Background", "Textlines"] + values_indexes = [0, 1] + plt.figure(figsize=(80, 40)) + plt.rcParams["font.size"] = "40" + plt.subplot(1, 2, 1) + plt.imshow(image_page) + plt.subplot(1, 2, 2) + im = plt.imshow(textline_mask_tot_ea[:, :]) + colors = [im.cmap(im.norm(value)) for value in values] + patches = [mpatches.Patch(color=colors[np.where(values == i)[0][0]], label="{l}".format(l=pixels[int(np.where(values_indexes == i)[0][0])])) for i in values] + plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=60) + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem + "_textline_and_page.png")) def save_deskewed_image(self, slope_deskew): if self.dir_of_all is not None: - img_rotated = rotyate_image_different(self.image_org, slope_deskew) cv2.imwrite(os.path.join(self.dir_of_all, 
self.image_filename_stem + "_org.png"), self.image_org) - cv2.imwrite(os.path.join(self.dir_of_deskewed, self.image_filename_stem + "_deskewed.png"), img_rotated) + if self.dir_of_deskewed is not None: + img_rotated = rotyate_image_different(self.image_org, slope_deskew) + cv2.imwrite(os.path.join(self.dir_of_deskewed, self.image_filename_stem + "_deskewed.png"), img_rotated) def save_page_image(self, image_page): - cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_page.png"), image_page) + if self.dir_of_all is not None: + cv2.imwrite(os.path.join(self.dir_of_all, self.image_filename_stem + "_page.png"), image_page) def save_plot_of_textline_density(self, img_patch_org): - plt.figure(figsize=(80,40)) - plt.rcParams['font.size']='50' - plt.subplot(1,2,1) - plt.imshow(img_patch_org) - plt.subplot(1,2,2) - plt.plot(gaussian_filter1d(img_patch_org.sum(axis=1), 3),np.array(range(len(gaussian_filter1d(img_patch_org.sum(axis=1), 3)))),linewidth=8) - plt.xlabel('Density of textline prediction in direction of X axis',fontsize=60) - plt.ylabel('Height',fontsize=60) - plt.yticks([0,len(gaussian_filter1d(img_patch_org.sum(axis=1), 3))]) - plt.gca().invert_yaxis() - plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem+'_density_of_textline.png')) + if self.dir_of_all is not None: + plt.figure(figsize=(80,40)) + plt.rcParams['font.size']='50' + plt.subplot(1,2,1) + plt.imshow(img_patch_org) + plt.subplot(1,2,2) + plt.plot(gaussian_filter1d(img_patch_org.sum(axis=1), 3),np.array(range(len(gaussian_filter1d(img_patch_org.sum(axis=1), 3)))),linewidth=8) + plt.xlabel('Density of textline prediction in direction of X axis',fontsize=60) + plt.ylabel('Height',fontsize=60) + plt.yticks([0,len(gaussian_filter1d(img_patch_org.sum(axis=1), 3))]) + plt.gca().invert_yaxis() + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem+'_density_of_textline.png')) def save_plot_of_rotation_angle(self, angels, var_res): - #print('galdi?') - 
plt.figure(figsize=(60,30)) - plt.rcParams['font.size']='50' - plt.plot(angels,np.array(var_res),'-o',markersize=25,linewidth=4) - plt.xlabel('angle',fontsize=50) - plt.ylabel('variance of sum of rotated textline in direction of x axis',fontsize=50) - plt.plot(angels[np.argmax(var_res)],var_res[np.argmax(np.array(var_res))] ,'*',markersize=50,label='Angle of deskewing=' +str("{:.2f}".format(angels[np.argmax(var_res)]))+r'$\degree$') - plt.legend(loc='best') - plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem+'_rotation_angle.png')) + if self.dir_of_all is not None: + plt.figure(figsize=(60,30)) + plt.rcParams['font.size']='50' + plt.plot(angels,np.array(var_res),'-o',markersize=25,linewidth=4) + plt.xlabel('angle',fontsize=50) + plt.ylabel('variance of sum of rotated textline in direction of x axis',fontsize=50) + plt.plot(angels[np.argmax(var_res)],var_res[np.argmax(np.array(var_res))] ,'*',markersize=50,label='Angle of deskewing=' +str("{:.2f}".format(angels[np.argmax(var_res)]))+r'$\degree$') + plt.legend(loc='best') + plt.savefig(os.path.join(self.dir_of_all, self.image_filename_stem+'_rotation_angle.png')) def write_images_into_directory(self, img_contoures, image_page): - index = 0 - for cont_ind in img_contoures: - x, y, w, h = cv2.boundingRect(cont_ind) - box = [x, y, w, h] - croped_page, page_coord = crop_image_inside_box(box, image_page) - - croped_page = resize_image(croped_page, int(croped_page.shape[0] / self.scale_y), int(croped_page.shape[1] / self.scale_x)) - - path = os.path.join(self.dir_of_cropped_images, self.image_filename_stem + "_" + str(index) + ".jpg") - cv2.imwrite(path, croped_page) - index += 1 + if self.dir_of_cropped_images is not None: + index = 0 + for cont_ind in img_contoures: + x, y, w, h = cv2.boundingRect(cont_ind) + box = [x, y, w, h] + croped_page, page_coord = crop_image_inside_box(box, image_page) + + croped_page = resize_image(croped_page, int(croped_page.shape[0] / self.scale_y), int(croped_page.shape[1] / 
self.scale_x)) + + path = os.path.join(self.dir_of_cropped_images, self.image_filename_stem + "_" + str(index) + ".jpg") + cv2.imwrite(path, croped_page) + index += 1 diff --git a/sbb_newspapers_org_image/utils/contour.py b/sbb_newspapers_org_image/utils/contour.py index 06e2ee8..4fe3ae6 100644 --- a/sbb_newspapers_org_image/utils/contour.py +++ b/sbb_newspapers_org_image/utils/contour.py @@ -43,7 +43,6 @@ def get_text_region_boxes_by_given_contours(contours): def filter_contours_area_of_image(image, contours, hirarchy, max_area, min_area): found_polygons_early = list() - jv = 0 for c in contours: if len(c) < 3: # A polygon cannot have less than 3 points diff --git a/sbb_newspapers_org_image/utils/xml.py b/sbb_newspapers_org_image/utils/xml.py index bba7db8..fe806e9 100644 --- a/sbb_newspapers_org_image/utils/xml.py +++ b/sbb_newspapers_org_image/utils/xml.py @@ -58,4 +58,5 @@ def xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found name.set('index', str(indexer_region)) name.set('regionRef', 'r%s' % indexer_region) indexer_region += 1 + return id_of_marginalia diff --git a/sbb_newspapers_org_image/writer.py b/sbb_newspapers_org_image/writer.py index a949322..c8c34e4 100644 --- a/sbb_newspapers_org_image/writer.py +++ b/sbb_newspapers_org_image/writer.py @@ -36,7 +36,7 @@ class EynollahXmlWriter(): points_page_print = points_page_print + ' ' return points_page_print[:-1] - def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l): + def serialize_lines_in_marginal(self, marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l): for j in range(len(all_found_texline_polygons_marginals[marginal_idx])): textline = ET.SubElement(marginal, 'TextLine') textline.set('id', 'l%s' % id_indexer_l) @@ -54,7 +54,7 @@ class EynollahXmlWriter(): points_co += 
str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0])/self.scale_y)) - else: + if self.curved_line and np.abs(slopes_marginals[marginal_idx]) <= 45: if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x)) points_co += ',' @@ -63,6 +63,17 @@ class EynollahXmlWriter(): points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) points_co += ',' points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y)) + + elif self.curved_line and np.abs(slopes_marginals[marginal_idx]) > 45: + if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) + else: + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) + if l < len(all_found_texline_polygons_marginals[marginal_idx][j]) - 1: points_co += ' ' coord.set('points',points_co) @@ -119,7 +130,7 @@ class EynollahXmlWriter(): tree = ET.ElementTree(pcgts) tree.write(os.path.join(self.dir_out, 
self.image_filename_stem) + ".xml") - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page): + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure @@ -132,7 +143,7 @@ class EynollahXmlWriter(): id_indexer = 0 id_indexer_l = 0 if len(found_polygons_text_region) > 0: - xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) + id_of_marginalia = xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion = ET.SubElement(page, 'TextRegion') textregion.set('id', 'r%s' % id_indexer) @@ -143,13 +154,13 @@ class EynollahXmlWriter(): id_indexer_l = self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, id_indexer_l) add_textequiv(textregion) - for marginal_idx in range(len(found_polygons_marginals)): + for mm in range(len(found_polygons_marginals)): marginal = ET.SubElement(page, 'TextRegion') marginal.set('id', id_of_marginalia[mm]) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) - self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l) + id_indexer_l = self.serialize_lines_in_marginal(marginal, 
all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) id_indexer = len(found_polygons_text_region) + len(found_polygons_marginals) for mm in range(len(found_polygons_text_region_img)): @@ -168,7 +179,7 @@ class EynollahXmlWriter(): return pcgts - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page): + def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page): self.logger.debug('enter build_pagexml_full_layout') # create the file structure @@ -182,7 +193,7 @@ class EynollahXmlWriter(): id_of_marginalia = [] if len(found_polygons_text_region) > 0: - xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) + id_of_marginalia = xml_reading_order(page, order_of_texts, id_of_texts, id_of_marginalia, found_polygons_marginals) for mm in range(len(found_polygons_text_region)): textregion=ET.SubElement(page, 'TextRegion') textregion.set('id', 'r%s' % id_indexer) @@ -216,14 +227,14 @@ class EynollahXmlWriter(): coord_text.set('points', self.calculate_polygon_coords(found_polygons_drop_capitals, mm, page_coord)) add_textequiv(textregion) - for marginal_idx in range(len(found_polygons_marginals)): + for mm in range(len(found_polygons_marginals)): 
marginal = ET.SubElement(page, 'TextRegion') add_textequiv(textregion) marginal.set('id', id_of_marginalia[mm]) marginal.set('type', 'marginalia') coord_text = ET.SubElement(marginal, 'Coords') coord_text.set('points', self.calculate_polygon_coords(found_polygons_marginals, mm, page_coord)) - self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, id_indexer_l) + id_indexer_l = self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, id_indexer_l) id_indexer = len(found_polygons_text_region) + len(found_polygons_text_region_h) + len(found_polygons_marginals) + len(found_polygons_drop_capitals) for mm in range(len(found_polygons_text_region_img)): From a96d23712d6450bf8f01b41718fa2a35ac24480c Mon Sep 17 00:00:00 2001 From: vahid Date: Sat, 27 Feb 2021 10:15:58 -0500 Subject: [PATCH 88/89] use all cpu nodes --- sbb_newspapers_org_image/eynollah.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbb_newspapers_org_image/eynollah.py b/sbb_newspapers_org_image/eynollah.py index 4bd0c56..9bb95c9 100644 --- a/sbb_newspapers_org_image/eynollah.py +++ b/sbb_newspapers_org_image/eynollah.py @@ -709,7 +709,7 @@ class Eynollah: def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew): self.logger.debug("enter get_slopes_and_deskew_new") - num_cores = 1#cpu_count() + num_cores = cpu_count() queue_of_all_params = Queue() processes = [] From 58c4403e13e4df2aa7ff0fa236434c41f1d729b4 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 24 Feb 2021 18:49:05 +0100 Subject: [PATCH 89/89] rename package to qurator.eynollah --- {sbb_newspapers_org_image => qurator}/.gitkeep | 0 qurator/__init__.py | 1 + .../eynollah}/__init__.py | 0 .../eynollah}/cli.py | 2 +- .../eynollah}/eynollah.py | 0 .../eynollah}/plot.py | 0 
.../eynollah}/utils/__init__.py | 0 .../eynollah}/utils/contour.py | 0 .../eynollah}/utils/drop_capitals.py | 0 .../eynollah}/utils/is_nan.py | 0 .../eynollah}/utils/marginals.py | 0 .../eynollah}/utils/pil_cv2.py | 0 .../eynollah}/utils/resize.py | 0 .../eynollah}/utils/rotate.py | 0 .../eynollah}/utils/separate_lines.py | 0 .../eynollah}/utils/xml.py | 0 .../eynollah}/writer.py | 0 setup.py | 7 ++++--- tests/test_dpi.py | 2 +- tests/test_run.py | 2 +- tests/test_smoke.py | 12 ++++++------ tests/test_xml.py | 2 +- 22 files changed, 15 insertions(+), 13 deletions(-) rename {sbb_newspapers_org_image => qurator}/.gitkeep (100%) create mode 100644 qurator/__init__.py rename {sbb_newspapers_org_image => qurator/eynollah}/__init__.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/cli.py (98%) rename {sbb_newspapers_org_image => qurator/eynollah}/eynollah.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/plot.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/utils/__init__.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/utils/contour.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/utils/drop_capitals.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/utils/is_nan.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/utils/marginals.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/utils/pil_cv2.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/utils/resize.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/utils/rotate.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/utils/separate_lines.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/utils/xml.py (100%) rename {sbb_newspapers_org_image => qurator/eynollah}/writer.py (100%) diff --git a/sbb_newspapers_org_image/.gitkeep b/qurator/.gitkeep similarity index 100% rename from sbb_newspapers_org_image/.gitkeep rename to qurator/.gitkeep diff --git 
a/qurator/__init__.py b/qurator/__init__.py new file mode 100644 index 0000000..5284146 --- /dev/null +++ b/qurator/__init__.py @@ -0,0 +1 @@ +__import__("pkg_resources").declare_namespace(__name__) diff --git a/sbb_newspapers_org_image/__init__.py b/qurator/eynollah/__init__.py similarity index 100% rename from sbb_newspapers_org_image/__init__.py rename to qurator/eynollah/__init__.py diff --git a/sbb_newspapers_org_image/cli.py b/qurator/eynollah/cli.py similarity index 98% rename from sbb_newspapers_org_image/cli.py rename to qurator/eynollah/cli.py index c18555d..514853a 100644 --- a/sbb_newspapers_org_image/cli.py +++ b/qurator/eynollah/cli.py @@ -1,7 +1,7 @@ import sys import click from ocrd_utils import initLogging, setOverrideLogLevel -from sbb_newspapers_org_image.eynollah import Eynollah +from qurator.eynollah.eynollah import Eynollah @click.command() diff --git a/sbb_newspapers_org_image/eynollah.py b/qurator/eynollah/eynollah.py similarity index 100% rename from sbb_newspapers_org_image/eynollah.py rename to qurator/eynollah/eynollah.py diff --git a/sbb_newspapers_org_image/plot.py b/qurator/eynollah/plot.py similarity index 100% rename from sbb_newspapers_org_image/plot.py rename to qurator/eynollah/plot.py diff --git a/sbb_newspapers_org_image/utils/__init__.py b/qurator/eynollah/utils/__init__.py similarity index 100% rename from sbb_newspapers_org_image/utils/__init__.py rename to qurator/eynollah/utils/__init__.py diff --git a/sbb_newspapers_org_image/utils/contour.py b/qurator/eynollah/utils/contour.py similarity index 100% rename from sbb_newspapers_org_image/utils/contour.py rename to qurator/eynollah/utils/contour.py diff --git a/sbb_newspapers_org_image/utils/drop_capitals.py b/qurator/eynollah/utils/drop_capitals.py similarity index 100% rename from sbb_newspapers_org_image/utils/drop_capitals.py rename to qurator/eynollah/utils/drop_capitals.py diff --git a/sbb_newspapers_org_image/utils/is_nan.py b/qurator/eynollah/utils/is_nan.py 
similarity index 100% rename from sbb_newspapers_org_image/utils/is_nan.py rename to qurator/eynollah/utils/is_nan.py diff --git a/sbb_newspapers_org_image/utils/marginals.py b/qurator/eynollah/utils/marginals.py similarity index 100% rename from sbb_newspapers_org_image/utils/marginals.py rename to qurator/eynollah/utils/marginals.py diff --git a/sbb_newspapers_org_image/utils/pil_cv2.py b/qurator/eynollah/utils/pil_cv2.py similarity index 100% rename from sbb_newspapers_org_image/utils/pil_cv2.py rename to qurator/eynollah/utils/pil_cv2.py diff --git a/sbb_newspapers_org_image/utils/resize.py b/qurator/eynollah/utils/resize.py similarity index 100% rename from sbb_newspapers_org_image/utils/resize.py rename to qurator/eynollah/utils/resize.py diff --git a/sbb_newspapers_org_image/utils/rotate.py b/qurator/eynollah/utils/rotate.py similarity index 100% rename from sbb_newspapers_org_image/utils/rotate.py rename to qurator/eynollah/utils/rotate.py diff --git a/sbb_newspapers_org_image/utils/separate_lines.py b/qurator/eynollah/utils/separate_lines.py similarity index 100% rename from sbb_newspapers_org_image/utils/separate_lines.py rename to qurator/eynollah/utils/separate_lines.py diff --git a/sbb_newspapers_org_image/utils/xml.py b/qurator/eynollah/utils/xml.py similarity index 100% rename from sbb_newspapers_org_image/utils/xml.py rename to qurator/eynollah/utils/xml.py diff --git a/sbb_newspapers_org_image/writer.py b/qurator/eynollah/writer.py similarity index 100% rename from sbb_newspapers_org_image/writer.py rename to qurator/eynollah/writer.py diff --git a/setup.py b/setup.py index 8e34b43..7988aee 100644 --- a/setup.py +++ b/setup.py @@ -10,12 +10,13 @@ setup( author='Vahid Rezanezhad', url='https://github.com/qurator-spk/eynollah', license='Apache License 2.0', - packages=find_packages(), + namespace_packages=['qurator'], + packages=find_packages(exclude=['tests']), install_requires=install_requires, entry_points={ 'console_scripts': [ - 
'eynollah=sbb_newspapers_org_image.cli:main', - # 'ocrd-eynollah=eynollah.ocrd_cli:cli', + 'eynollah=qurator.eynollah.cli:main', + # 'ocrd-eynollah=qurator.eynollah.ocrd_cli:cli', ] }, ) diff --git a/tests/test_dpi.py b/tests/test_dpi.py index 59c5df4..380928d 100644 --- a/tests/test_dpi.py +++ b/tests/test_dpi.py @@ -1,5 +1,5 @@ from pathlib import Path -from sbb_newspapers_org_image.utils.pil_cv2 import check_dpi +from qurator.eynollah.utils.pil_cv2 import check_dpi from tests.base import main def test_dpi(): diff --git a/tests/test_run.py b/tests/test_run.py index 74f7fde..b1137e7 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -2,7 +2,7 @@ from os import environ from pathlib import Path from ocrd_utils import pushd_popd from tests.base import CapturingTestCase as TestCase, main -from sbb_newspapers_org_image.cli import main as eynollah_cli +from qurator.eynollah.cli import main as eynollah_cli testdir = Path(__file__).parent.resolve() diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 7d1b381..d069479 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1,7 +1,7 @@ def test_utils_import(): - import sbb_newspapers_org_image.utils - import sbb_newspapers_org_image.utils.contour - import sbb_newspapers_org_image.utils.drop_capitals - import sbb_newspapers_org_image.utils.drop_capitals - import sbb_newspapers_org_image.utils.is_nan - import sbb_newspapers_org_image.utils.rotate + import qurator.eynollah.utils + import qurator.eynollah.utils.contour + import qurator.eynollah.utils.drop_capitals + import qurator.eynollah.utils.drop_capitals + import qurator.eynollah.utils.is_nan + import qurator.eynollah.utils.rotate diff --git a/tests/test_xml.py b/tests/test_xml.py index fa0e793..052f91e 100644 --- a/tests/test_xml.py +++ b/tests/test_xml.py @@ -1,5 +1,5 @@ from lxml import etree as ET -from sbb_newspapers_org_image.utils.xml import create_page_xml, NAMESPACES +from qurator.eynollah.utils.xml import create_page_xml, NAMESPACES def 
tostring(el): return ET.tostring(el).decode('utf-8')