do_work_of_slopes_new*, do_back_rotation_and_get_cnt_back, do_work_of_contours_in_image: use mp.Pool, simplify

pull/142/head
Robert Sachunsky 2 weeks ago
parent 25e967397d
commit 68456ea002

@ -11,8 +11,9 @@ import os
import sys import sys
import time import time
import warnings import warnings
from functools import partial
from pathlib import Path from pathlib import Path
from multiprocessing import Process, Queue, cpu_count from multiprocessing import Pool, cpu_count
import gc import gc
from ocrd_utils import getLogger from ocrd_utils import getLogger
import cv2 import cv2
@ -60,14 +61,20 @@ from .utils.contour import (
from .utils.rotate import ( from .utils.rotate import (
rotate_image, rotate_image,
rotation_not_90_func, rotation_not_90_func,
rotation_not_90_func_full_layout) rotation_not_90_func_full_layout
)
from .utils.separate_lines import ( from .utils.separate_lines import (
textline_contours_postprocessing, textline_contours_postprocessing,
separate_lines_new2, separate_lines_new2,
return_deskew_slop) return_deskew_slop,
do_work_of_slopes_new,
do_work_of_slopes_new_curved,
do_work_of_slopes_new_light,
)
from .utils.drop_capitals import ( from .utils.drop_capitals import (
adhere_drop_capital_region_into_corresponding_textline, adhere_drop_capital_region_into_corresponding_textline,
filter_small_drop_capitals_from_no_patch_layout) filter_small_drop_capitals_from_no_patch_layout
)
from .utils.marginals import get_marginals from .utils.marginals import get_marginals
from .utils.resize import resize_image from .utils.resize import resize_image
from .utils import ( from .utils import (
@ -82,7 +89,8 @@ from .utils import (
small_textlines_to_parent_adherence2, small_textlines_to_parent_adherence2,
order_of_regions, order_of_regions,
find_number_of_columns_in_document, find_number_of_columns_in_document,
return_boxes_of_images_by_order_of_reading_new) return_boxes_of_images_by_order_of_reading_new
)
from .utils.pil_cv2 import check_dpi, pil2cv from .utils.pil_cv2 import check_dpi, pil2cv
from .utils.xml import order_and_id_of_texts from .utils.xml import order_and_id_of_texts
from .plot import EynollahPlotter from .plot import EynollahPlotter
@ -1504,381 +1512,73 @@ class Eynollah:
all_box_coord.append(crop_coor) all_box_coord.append(crop_coor)
return slopes, all_found_textline_polygons, boxes, contours, contours_par, all_box_coord, np.array(range(len(contours_par))) return all_found_textline_polygons, boxes, contours, contours_par, all_box_coord, np.array(range(len(contours_par))), slopes
def get_slopes_and_deskew_new_light(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew): def get_slopes_and_deskew_new_light(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew):
self.logger.debug("enter get_slopes_and_deskew_new") if not len(contours):
return [], [], [], [], [], [], []
self.logger.debug("enter get_slopes_and_deskew_new_light")
if len(contours)>15: if len(contours)>15:
num_cores = cpu_count() num_cores = cpu_count()
else: else:
num_cores = 1 num_cores = 1
queue_of_all_params = Queue() with Pool(processes=num_cores) as pool:
results = pool.starmap(
processes = [] partial(do_work_of_slopes_new_light,
nh = np.linspace(0, len(boxes), num_cores + 1) textline_mask_tot_ea=textline_mask_tot,
indexes_by_text_con = np.array(range(len(contours_par))) image_page_rotated=image_page_rotated,
for i in range(num_cores): slope_deskew=slope_deskew,
boxes_per_process = boxes[int(nh[i]) : int(nh[i + 1])] logger=self.logger,
contours_per_process = contours[int(nh[i]) : int(nh[i + 1])] MAX_SLOPE=MAX_SLOPE,
contours_par_per_process = contours_par[int(nh[i]) : int(nh[i + 1])] KERNEL=KERNEL,
indexes_text_con_per_process = indexes_by_text_con[int(nh[i]) : int(nh[i + 1])] plotter=self.plotter,),
zip(boxes, contours, contours_par, range(len(contours_par))))
processes.append(Process(target=self.do_work_of_slopes_new_light, args=(queue_of_all_params, boxes_per_process, textline_mask_tot, contours_per_process, contours_par_per_process, indexes_text_con_per_process, image_page_rotated, slope_deskew))) #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results)
for i in range(num_cores): self.logger.debug("exit get_slopes_and_deskew_new_light")
processes[i].start() return tuple(zip(*results))
slopes = []
all_found_textline_polygons = []
all_found_text_regions = []
all_found_text_regions_par = []
boxes = []
all_box_coord = []
all_index_text_con = []
for i in range(num_cores):
list_all_par = queue_of_all_params.get(True)
slopes_for_sub_process = list_all_par[0]
polys_for_sub_process = list_all_par[1]
boxes_for_sub_process = list_all_par[2]
contours_for_subprocess = list_all_par[3]
contours_par_for_subprocess = list_all_par[4]
boxes_coord_for_subprocess = list_all_par[5]
indexes_for_subprocess = list_all_par[6]
for j in range(len(slopes_for_sub_process)):
slopes.append(slopes_for_sub_process[j])
all_found_textline_polygons.append(polys_for_sub_process[j])
boxes.append(boxes_for_sub_process[j])
all_found_text_regions.append(contours_for_subprocess[j])
all_found_text_regions_par.append(contours_par_for_subprocess[j])
all_box_coord.append(boxes_coord_for_subprocess[j])
all_index_text_con.append(indexes_for_subprocess[j])
for i in range(num_cores):
processes[i].join()
self.logger.debug('slopes %s', slopes)
self.logger.debug("exit get_slopes_and_deskew_new")
return slopes, all_found_textline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con
def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew): def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew):
if not len(contours):
return [], [], [], [], [], [], []
self.logger.debug("enter get_slopes_and_deskew_new") self.logger.debug("enter get_slopes_and_deskew_new")
num_cores = cpu_count() num_cores = cpu_count()
queue_of_all_params = Queue() with Pool(processes=num_cores) as pool:
results = pool.starmap(
processes = [] partial(do_work_of_slopes_new,
nh = np.linspace(0, len(boxes), num_cores + 1) textline_mask_tot_ea=textline_mask_tot,
indexes_by_text_con = np.array(range(len(contours_par))) image_page_rotated=image_page_rotated,
for i in range(num_cores): slope_deskew=slope_deskew,
boxes_per_process = boxes[int(nh[i]) : int(nh[i + 1])] logger=self.logger,
contours_per_process = contours[int(nh[i]) : int(nh[i + 1])] MAX_SLOPE=MAX_SLOPE,
contours_par_per_process = contours_par[int(nh[i]) : int(nh[i + 1])] KERNEL=KERNEL,
indexes_text_con_per_process = indexes_by_text_con[int(nh[i]) : int(nh[i + 1])] plotter=self.plotter,),
zip(boxes, contours, contours_par, range(len(contours_par))))
processes.append(Process(target=self.do_work_of_slopes_new, args=(queue_of_all_params, boxes_per_process, textline_mask_tot, contours_per_process, contours_par_per_process, indexes_text_con_per_process, image_page_rotated, slope_deskew))) #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results)
for i in range(num_cores):
processes[i].start()
slopes = []
all_found_textline_polygons = []
all_found_text_regions = []
all_found_text_regions_par = []
boxes = []
all_box_coord = []
all_index_text_con = []
for i in range(num_cores):
list_all_par = queue_of_all_params.get(True)
slopes_for_sub_process = list_all_par[0]
polys_for_sub_process = list_all_par[1]
boxes_for_sub_process = list_all_par[2]
contours_for_subprocess = list_all_par[3]
contours_par_for_subprocess = list_all_par[4]
boxes_coord_for_subprocess = list_all_par[5]
indexes_for_subprocess = list_all_par[6]
for j in range(len(slopes_for_sub_process)):
slopes.append(slopes_for_sub_process[j])
all_found_textline_polygons.append(polys_for_sub_process[j])
boxes.append(boxes_for_sub_process[j])
all_found_text_regions.append(contours_for_subprocess[j])
all_found_text_regions_par.append(contours_par_for_subprocess[j])
all_box_coord.append(boxes_coord_for_subprocess[j])
all_index_text_con.append(indexes_for_subprocess[j])
for i in range(num_cores):
processes[i].join()
self.logger.debug('slopes %s', slopes)
self.logger.debug("exit get_slopes_and_deskew_new") self.logger.debug("exit get_slopes_and_deskew_new")
return slopes, all_found_textline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con return tuple(zip(*results))
def get_slopes_and_deskew_new_curved(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, mask_texts_only, num_col, scale_par, slope_deskew): def get_slopes_and_deskew_new_curved(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, mask_texts_only, num_col, scale_par, slope_deskew):
if not len(contours):
return [], [], [], [], [], [], []
self.logger.debug("enter get_slopes_and_deskew_new_curved") self.logger.debug("enter get_slopes_and_deskew_new_curved")
num_cores = cpu_count() num_cores = cpu_count()
queue_of_all_params = Queue() with Pool(processes=num_cores) as pool:
results = pool.starmap(
processes = [] partial(do_work_of_slopes_new_curved,
nh = np.linspace(0, len(boxes), num_cores + 1) textline_mask_tot_ea=textline_mask_tot,
indexes_by_text_con = np.array(range(len(contours_par))) image_page_rotated=image_page_rotated,
mask_texts_only=mask_texts_only,
for i in range(num_cores): num_col=num_col,
boxes_per_process = boxes[int(nh[i]) : int(nh[i + 1])] scale_par=scale_par,
contours_per_process = contours[int(nh[i]) : int(nh[i + 1])] slope_deskew=slope_deskew,
contours_par_per_process = contours_par[int(nh[i]) : int(nh[i + 1])] logger=self.logger,
indexes_text_con_per_process = indexes_by_text_con[int(nh[i]) : int(nh[i + 1])] MAX_SLOPE=MAX_SLOPE,
KERNEL=KERNEL,
processes.append(Process(target=self.do_work_of_slopes_new_curved, args=(queue_of_all_params, boxes_per_process, textline_mask_tot, contours_per_process, contours_par_per_process, image_page_rotated, mask_texts_only, num_col, scale_par, indexes_text_con_per_process, slope_deskew))) plotter=self.plotter,),
zip(boxes, contours, contours_par, range(len(contours_par))))
for i in range(num_cores): #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results)
processes[i].start() self.logger.debug("exit get_slopes_and_deskew_new_curved")
return tuple(zip(*results))
slopes = []
all_found_textline_polygons = []
all_found_text_regions = []
all_found_text_regions_par = []
boxes = []
all_box_coord = []
all_index_text_con = []
for i in range(num_cores):
list_all_par = queue_of_all_params.get(True)
polys_for_sub_process = list_all_par[0]
boxes_for_sub_process = list_all_par[1]
contours_for_subprocess = list_all_par[2]
contours_par_for_subprocess = list_all_par[3]
boxes_coord_for_subprocess = list_all_par[4]
indexes_for_subprocess = list_all_par[5]
slopes_for_sub_process = list_all_par[6]
for j in range(len(polys_for_sub_process)):
slopes.append(slopes_for_sub_process[j])
all_found_textline_polygons.append(polys_for_sub_process[j][::-1])
boxes.append(boxes_for_sub_process[j])
all_found_text_regions.append(contours_for_subprocess[j])
all_found_text_regions_par.append(contours_par_for_subprocess[j])
all_box_coord.append(boxes_coord_for_subprocess[j])
all_index_text_con.append(indexes_for_subprocess[j])
for i in range(num_cores):
processes[i].join()
# print(slopes,'slopes')
return all_found_textline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con, slopes
def do_work_of_slopes_new_curved(self, queue_of_all_params, boxes_text, textline_mask_tot_ea, contours_per_process, contours_par_per_process, image_page_rotated, mask_texts_only, num_col, scale_par, indexes_r_con_per_pro, slope_deskew):
self.logger.debug("enter do_work_of_slopes_new_curved")
slopes_per_each_subprocess = []
bounding_box_of_textregion_per_each_subprocess = []
textlines_rectangles_per_each_subprocess = []
contours_textregion_per_each_subprocess = []
contours_textregion_par_per_each_subprocess = []
all_box_coord_per_process = []
index_by_text_region_contours = []
textline_cnt_separated = np.zeros(textline_mask_tot_ea.shape)
for mv in range(len(boxes_text)):
all_text_region_raw = textline_mask_tot_ea[boxes_text[mv][1] : boxes_text[mv][1] + boxes_text[mv][3], boxes_text[mv][0] : boxes_text[mv][0] + boxes_text[mv][2]]
all_text_region_raw = all_text_region_raw.astype(np.uint8)
img_int_p = all_text_region_raw[:, :]
# img_int_p=cv2.erode(img_int_p,KERNEL,iterations = 2)
# plt.imshow(img_int_p)
# plt.show()
if img_int_p.shape[0] / img_int_p.shape[1] < 0.1:
slopes_per_each_subprocess.append(0)
slope_for_all = [slope_deskew][0]
else:
try:
textline_con, hierarchy = return_contours_of_image(img_int_p)
textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con, hierarchy, max_area=1, min_area=0.0008)
y_diff_mean = find_contours_mean_y_diff(textline_con_fil)
if self.isNaN(y_diff_mean):
slope_for_all = MAX_SLOPE
else:
sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0)))
img_int_p[img_int_p > 0] = 1
slope_for_all = return_deskew_slop(img_int_p, sigma_des, plotter=self.plotter)
if abs(slope_for_all) < 0.5:
slope_for_all = [slope_deskew][0]
except Exception as why:
self.logger.error(why)
slope_for_all = MAX_SLOPE
if slope_for_all == MAX_SLOPE:
slope_for_all = [slope_deskew][0]
slopes_per_each_subprocess.append(slope_for_all)
index_by_text_region_contours.append(indexes_r_con_per_pro[mv])
_, crop_coor = crop_image_inside_box(boxes_text[mv], image_page_rotated)
if abs(slope_for_all) < 45:
# all_box_coord.append(crop_coor)
textline_region_in_image = np.zeros(textline_mask_tot_ea.shape)
cnt_o_t_max = contours_par_per_process[mv]
x, y, w, h = cv2.boundingRect(cnt_o_t_max)
mask_biggest = np.zeros(mask_texts_only.shape)
mask_biggest = cv2.fillPoly(mask_biggest, pts=[cnt_o_t_max], color=(1, 1, 1))
mask_region_in_patch_region = mask_biggest[y : y + h, x : x + w]
textline_biggest_region = mask_biggest * textline_mask_tot_ea
# print(slope_for_all,'slope_for_all')
textline_rotated_separated = separate_lines_new2(textline_biggest_region[y : y + h, x : x + w], 0, num_col, slope_for_all, plotter=self.plotter)
# new line added
##print(np.shape(textline_rotated_separated),np.shape(mask_biggest))
textline_rotated_separated[mask_region_in_patch_region[:, :] != 1] = 0
# till here
textline_cnt_separated[y : y + h, x : x + w] = textline_rotated_separated
textline_region_in_image[y : y + h, x : x + w] = textline_rotated_separated
# plt.imshow(textline_region_in_image)
# plt.show()
# plt.imshow(textline_cnt_separated)
# plt.show()
pixel_img = 1
cnt_textlines_in_image = return_contours_of_interested_textline(textline_region_in_image, pixel_img)
textlines_cnt_per_region = []
for jjjj in range(len(cnt_textlines_in_image)):
mask_biggest2 = np.zeros(mask_texts_only.shape)
mask_biggest2 = cv2.fillPoly(mask_biggest2, pts=[cnt_textlines_in_image[jjjj]], color=(1, 1, 1))
if num_col + 1 == 1:
mask_biggest2 = cv2.dilate(mask_biggest2, KERNEL, iterations=5)
else:
mask_biggest2 = cv2.dilate(mask_biggest2, KERNEL, iterations=4)
pixel_img = 1
mask_biggest2 = resize_image(mask_biggest2, int(mask_biggest2.shape[0] * scale_par), int(mask_biggest2.shape[1] * scale_par))
cnt_textlines_in_image_ind = return_contours_of_interested_textline(mask_biggest2, pixel_img)
try:
textlines_cnt_per_region.append(cnt_textlines_in_image_ind[0])
except Exception as why:
self.logger.error(why)
else:
add_boxes_coor_into_textlines = True
textlines_cnt_per_region = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contours_par_per_process[mv], boxes_text[mv], add_boxes_coor_into_textlines)
add_boxes_coor_into_textlines = False
# print(np.shape(textlines_cnt_per_region),'textlines_cnt_per_region')
textlines_rectangles_per_each_subprocess.append(textlines_cnt_per_region)
bounding_box_of_textregion_per_each_subprocess.append(boxes_text[mv])
contours_textregion_per_each_subprocess.append(contours_per_process[mv])
contours_textregion_par_per_each_subprocess.append(contours_par_per_process[mv])
all_box_coord_per_process.append(crop_coor)
queue_of_all_params.put([textlines_rectangles_per_each_subprocess, bounding_box_of_textregion_per_each_subprocess, contours_textregion_per_each_subprocess, contours_textregion_par_per_each_subprocess, all_box_coord_per_process, index_by_text_region_contours, slopes_per_each_subprocess])
def do_work_of_slopes_new_light(self, queue_of_all_params, boxes_text, textline_mask_tot_ea, contours_per_process, contours_par_per_process, indexes_r_con_per_pro, image_page_rotated, slope_deskew):
self.logger.debug('enter do_work_of_slopes_new_light')
slopes_per_each_subprocess = []
bounding_box_of_textregion_per_each_subprocess = []
textlines_rectangles_per_each_subprocess = []
contours_textregion_per_each_subprocess = []
contours_textregion_par_per_each_subprocess = []
all_box_coord_per_process = []
index_by_text_region_contours = []
for mv in range(len(boxes_text)):
_, crop_coor = crop_image_inside_box(boxes_text[mv],image_page_rotated)
mask_textline = np.zeros((textline_mask_tot_ea.shape))
mask_textline = cv2.fillPoly(mask_textline,pts=[contours_per_process[mv]],color=(1,1,1))
all_text_region_raw = (textline_mask_tot_ea*mask_textline[:,:])[boxes_text[mv][1]:boxes_text[mv][1]+boxes_text[mv][3] , boxes_text[mv][0]:boxes_text[mv][0]+boxes_text[mv][2] ]
all_text_region_raw=all_text_region_raw.astype(np.uint8)
slopes_per_each_subprocess.append([slope_deskew][0])
mask_only_con_region = np.zeros(textline_mask_tot_ea.shape)
mask_only_con_region = cv2.fillPoly(mask_only_con_region, pts=[contours_par_per_process[mv]], color=(1, 1, 1))
if self.textline_light:
all_text_region_raw = np.copy(textline_mask_tot_ea)
all_text_region_raw[mask_only_con_region == 0] = 0
cnt_clean_rot_raw, hir_on_cnt_clean_rot = return_contours_of_image(all_text_region_raw)
cnt_clean_rot = filter_contours_area_of_image(all_text_region_raw, cnt_clean_rot_raw, hir_on_cnt_clean_rot, max_area=1, min_area=0.00001)
else:
all_text_region_raw = np.copy(textline_mask_tot_ea[boxes_text[mv][1] : boxes_text[mv][1] + boxes_text[mv][3], boxes_text[mv][0] : boxes_text[mv][0] + boxes_text[mv][2]])
mask_only_con_region = mask_only_con_region[boxes_text[mv][1] : boxes_text[mv][1] + boxes_text[mv][3], boxes_text[mv][0] : boxes_text[mv][0] + boxes_text[mv][2]]
all_text_region_raw[mask_only_con_region == 0] = 0
cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, [slope_deskew][0], contours_par_per_process[mv], boxes_text[mv])
textlines_rectangles_per_each_subprocess.append(cnt_clean_rot)
index_by_text_region_contours.append(indexes_r_con_per_pro[mv])
bounding_box_of_textregion_per_each_subprocess.append(boxes_text[mv])
contours_textregion_per_each_subprocess.append(contours_per_process[mv])
contours_textregion_par_per_each_subprocess.append(contours_par_per_process[mv])
all_box_coord_per_process.append(crop_coor)
queue_of_all_params.put([slopes_per_each_subprocess, textlines_rectangles_per_each_subprocess, bounding_box_of_textregion_per_each_subprocess, contours_textregion_per_each_subprocess, contours_textregion_par_per_each_subprocess, all_box_coord_per_process, index_by_text_region_contours])
def do_work_of_slopes_new(self, queue_of_all_params, boxes_text, textline_mask_tot_ea, contours_per_process, contours_par_per_process, indexes_r_con_per_pro, image_page_rotated, slope_deskew):
self.logger.debug('enter do_work_of_slopes_new')
slopes_per_each_subprocess = []
bounding_box_of_textregion_per_each_subprocess = []
textlines_rectangles_per_each_subprocess = []
contours_textregion_per_each_subprocess = []
contours_textregion_par_per_each_subprocess = []
all_box_coord_per_process = []
index_by_text_region_contours = []
for mv in range(len(boxes_text)):
_, crop_coor = crop_image_inside_box(boxes_text[mv],image_page_rotated)
mask_textline = np.zeros((textline_mask_tot_ea.shape))
mask_textline = cv2.fillPoly(mask_textline,pts=[contours_per_process[mv]],color=(1,1,1))
all_text_region_raw = (textline_mask_tot_ea*mask_textline[:,:])[boxes_text[mv][1]:boxes_text[mv][1]+boxes_text[mv][3] , boxes_text[mv][0]:boxes_text[mv][0]+boxes_text[mv][2] ]
all_text_region_raw=all_text_region_raw.astype(np.uint8)
img_int_p=all_text_region_raw[:,:]#self.all_text_region_raw[mv]
img_int_p=cv2.erode(img_int_p,KERNEL,iterations = 2)
if img_int_p.shape[0]/img_int_p.shape[1]<0.1:
slopes_per_each_subprocess.append(0)
slope_for_all = [slope_deskew][0]
all_text_region_raw = textline_mask_tot_ea[boxes_text[mv][1] : boxes_text[mv][1] + boxes_text[mv][3], boxes_text[mv][0] : boxes_text[mv][0] + boxes_text[mv][2]]
cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contours_par_per_process[mv], boxes_text[mv], 0)
textlines_rectangles_per_each_subprocess.append(cnt_clean_rot)
index_by_text_region_contours.append(indexes_r_con_per_pro[mv])
bounding_box_of_textregion_per_each_subprocess.append(boxes_text[mv])
else:
try:
textline_con, hierarchy = return_contours_of_image(img_int_p)
textline_con_fil = filter_contours_area_of_image(img_int_p, textline_con, hierarchy, max_area=1, min_area=0.00008)
y_diff_mean = find_contours_mean_y_diff(textline_con_fil)
if self.isNaN(y_diff_mean):
slope_for_all = MAX_SLOPE
else:
sigma_des = int(y_diff_mean * (4.0 / 40.0))
if sigma_des < 1:
sigma_des = 1
img_int_p[img_int_p > 0] = 1
slope_for_all = return_deskew_slop(img_int_p, sigma_des, plotter=self.plotter)
if abs(slope_for_all) <= 0.5:
slope_for_all = [slope_deskew][0]
except Exception as why:
self.logger.error(why)
slope_for_all = MAX_SLOPE
if slope_for_all == MAX_SLOPE:
slope_for_all = [slope_deskew][0]
slopes_per_each_subprocess.append(slope_for_all)
mask_only_con_region = np.zeros(textline_mask_tot_ea.shape)
mask_only_con_region = cv2.fillPoly(mask_only_con_region, pts=[contours_par_per_process[mv]], color=(1, 1, 1))
# plt.imshow(mask_only_con_region)
# plt.show()
all_text_region_raw = np.copy(textline_mask_tot_ea[boxes_text[mv][1] : boxes_text[mv][1] + boxes_text[mv][3], boxes_text[mv][0] : boxes_text[mv][0] + boxes_text[mv][2]])
mask_only_con_region = mask_only_con_region[boxes_text[mv][1] : boxes_text[mv][1] + boxes_text[mv][3], boxes_text[mv][0] : boxes_text[mv][0] + boxes_text[mv][2]]
##plt.imshow(textline_mask_tot_ea)
##plt.show()
##plt.imshow(all_text_region_raw)
##plt.show()
##plt.imshow(mask_only_con_region)
##plt.show()
all_text_region_raw[mask_only_con_region == 0] = 0
cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, slope_for_all, contours_par_per_process[mv], boxes_text[mv])
textlines_rectangles_per_each_subprocess.append(cnt_clean_rot)
index_by_text_region_contours.append(indexes_r_con_per_pro[mv])
bounding_box_of_textregion_per_each_subprocess.append(boxes_text[mv])
contours_textregion_per_each_subprocess.append(contours_per_process[mv])
contours_textregion_par_per_each_subprocess.append(contours_par_per_process[mv])
all_box_coord_per_process.append(crop_coor)
queue_of_all_params.put([slopes_per_each_subprocess, textlines_rectangles_per_each_subprocess, bounding_box_of_textregion_per_each_subprocess, contours_textregion_per_each_subprocess, contours_textregion_par_per_each_subprocess, all_box_coord_per_process, index_by_text_region_contours])
def textline_contours(self, img, use_patches, scaler_h, scaler_w, num_col_classifier=None): def textline_contours(self, img, use_patches, scaler_h, scaler_w, num_col_classifier=None):
self.logger.debug('enter textline_contours') self.logger.debug('enter textline_contours')
@ -1923,6 +1623,7 @@ class Eynollah:
prediction_textline_longshot = self.do_prediction(False, img, self.model_textline) prediction_textline_longshot = self.do_prediction(False, img, self.model_textline)
prediction_textline_longshot_true_size = resize_image(prediction_textline_longshot, img_h, img_w) prediction_textline_longshot_true_size = resize_image(prediction_textline_longshot, img_h, img_w)
self.logger.debug('exit textline_contours')
return ((prediction_textline[:, :, 0]==1)*1).astype('uint8'), ((prediction_textline_longshot_true_size[:, :, 0]==1)*1).astype('uint8') return ((prediction_textline[:, :, 0]==1)*1).astype('uint8'), ((prediction_textline_longshot_true_size[:, :, 0]==1)*1).astype('uint8')
@ -1959,6 +1660,7 @@ class Eynollah:
q.put(slopes_sub) q.put(slopes_sub)
poly.put(poly_sub) poly.put(poly_sub)
box_sub.put(boxes_sub_new) box_sub.put(boxes_sub_new)
self.logger.debug('exit do_work_of_slopes')
def get_regions_light_v_extract_only_images(self,img,is_image_enhanced, num_col_classifier): def get_regions_light_v_extract_only_images(self,img,is_image_enhanced, num_col_classifier):
self.logger.debug("enter get_regions_extract_images_only") self.logger.debug("enter get_regions_extract_images_only")
@ -2069,6 +1771,7 @@ class Eynollah:
polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) ) polygons_of_images_fin.append(np.array([[page_coord_img[2], page_coord_img[0]], [page_coord_img[3], page_coord_img[0]], [page_coord_img[3], page_coord_img[1]], [page_coord_img[2], page_coord_img[1]]]) )
self.logger.debug("exit get_regions_extract_images_only")
return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin, image_page, page_coord, cont_page return text_regions_p_true, erosion_hurts, polygons_lines_xml, polygons_of_images_fin, image_page, page_coord, cont_page
def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier, skip_layout_and_reading_order=False): def get_regions_light_v(self,img,is_image_enhanced, num_col_classifier, skip_layout_and_reading_order=False):
@ -2146,6 +1849,7 @@ class Eynollah:
#print("inside 1 ", time.time()-t_in) #print("inside 1 ", time.time()-t_in)
###textline_mask_tot_ea = self.run_textline(img_bin) ###textline_mask_tot_ea = self.run_textline(img_bin)
self.logger.debug("detecting textlines on %s with %d colors", str(img_resized.shape), len(np.unique(img_resized)))
textline_mask_tot_ea = self.run_textline(img_resized, num_col_classifier) textline_mask_tot_ea = self.run_textline(img_resized, num_col_classifier)
@ -2269,9 +1973,11 @@ class Eynollah:
#plt.imshow(textline_mask_tot_ea) #plt.imshow(textline_mask_tot_ea)
#plt.show() #plt.show()
#print("inside 4 ", time.time()-t_in) #print("inside 4 ", time.time()-t_in)
self.logger.debug("exit get_regions_light_v")
return text_regions_p_true, erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin return text_regions_p_true, erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin
else: else:
img_bin = resize_image(img_bin,img_height_h, img_width_h ) img_bin = resize_image(img_bin,img_height_h, img_width_h )
self.logger.debug("exit get_regions_light_v")
return None, erosion_hurts, None, textline_mask_tot_ea, img_bin return None, erosion_hurts, None, textline_mask_tot_ea, img_bin
def get_regions_from_xy_2models(self,img,is_image_enhanced, num_col_classifier): def get_regions_from_xy_2models(self,img,is_image_enhanced, num_col_classifier):
@ -2392,6 +2098,7 @@ class Eynollah:
text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1)) text_regions_p_true=cv2.fillPoly(text_regions_p_true,pts=polygons_of_only_texts, color=(1,1,1))
self.logger.debug("exit get_regions_from_xy_2models")
return text_regions_p_true, erosion_hurts, polygons_lines_xml return text_regions_p_true, erosion_hurts, polygons_lines_xml
except: except:
@ -2461,6 +2168,7 @@ class Eynollah:
text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1)) text_regions_p_true = cv2.fillPoly(text_regions_p_true, pts = polygons_of_only_texts, color=(1,1,1))
erosion_hurts = True erosion_hurts = True
self.logger.debug("exit get_regions_from_xy_2models")
return text_regions_p_true, erosion_hurts, polygons_lines_xml return text_regions_p_true, erosion_hurts, polygons_lines_xml
def do_order_of_regions_full_layout(self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): def do_order_of_regions_full_layout(self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot):
@ -2633,6 +2341,7 @@ class Eynollah:
for iii in range(len(order_of_texts_tot)): for iii in range(len(order_of_texts_tot)):
order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0])
self.logger.debug("exit do_order_of_regions_full_layout")
return order_text_new, id_of_texts_tot return order_text_new, id_of_texts_tot
def do_order_of_regions_no_full_layout(self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot): def do_order_of_regions_no_full_layout(self, contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot):
@ -2743,6 +2452,7 @@ class Eynollah:
for iii in range(len(order_of_texts_tot)): for iii in range(len(order_of_texts_tot)):
order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0]) order_text_new.append(np.where(np.array(order_of_texts_tot) == iii)[0][0])
self.logger.debug("exit do_order_of_regions_no_full_layout")
return order_text_new, id_of_texts_tot return order_text_new, id_of_texts_tot
def check_iou_of_bounding_box_and_contour_for_tables(self, layout, table_prediction_early, pixel_tabel, num_col_classifier): def check_iou_of_bounding_box_and_contour_for_tables(self, layout, table_prediction_early, pixel_tabel, num_col_classifier):
layout_org = np.copy(layout) layout_org = np.copy(layout)
@ -5051,12 +4761,12 @@ class Eynollah:
if not self.curved_line: if not self.curved_line:
if self.light_version: if self.light_version:
if self.textline_light: if self.textline_light:
#slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = \ #all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = \
# self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, image_page_rotated, boxes_text, slope_deskew) # self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, image_page_rotated, boxes_text, slope_deskew)
slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = \ all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = \
self.get_slopes_and_deskew_new_light2(txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, image_page_rotated, boxes_text, slope_deskew) self.get_slopes_and_deskew_new_light2(txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, image_page_rotated, boxes_text, slope_deskew)
slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = \ all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = \
self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, image_page_rotated, boxes_marginals, slope_deskew) self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, image_page_rotated, boxes_marginals, slope_deskew)
#slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con = \ #slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, index_by_text_par_con = \
@ -5074,17 +4784,17 @@ class Eynollah:
else: else:
textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1) textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1)
slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = \ all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = \
self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew)
slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = \ all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = \
self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew)
#all_found_textline_polygons = self.filter_contours_inside_a_bigger_one(all_found_textline_polygons, textline_mask_tot_ea_org, type_contour="textline") #all_found_textline_polygons = self.filter_contours_inside_a_bigger_one(all_found_textline_polygons, textline_mask_tot_ea_org, type_contour="textline")
else: else:
textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1) textline_mask_tot_ea = cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1)
slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = \ all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = \
self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew)
slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = \ all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = \
self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew)
else: else:
scale_param = 1 scale_param = 1

@ -1,10 +1,11 @@
from functools import partial
from multiprocessing import cpu_count, Pool
import cv2 import cv2
import numpy as np import numpy as np
from shapely import geometry from shapely import geometry
from .rotate import rotate_image, rotation_image_new from .rotate import rotate_image, rotation_image_new
from multiprocessing import Process, Queue, cpu_count
from multiprocessing import Pool
def contours_in_same_horizon(cy_main_hor): def contours_in_same_horizon(cy_main_hor):
X1 = np.zeros((len(cy_main_hor), len(cy_main_hor))) X1 = np.zeros((len(cy_main_hor), len(cy_main_hor)))
X2 = np.zeros((len(cy_main_hor), len(cy_main_hor))) X2 = np.zeros((len(cy_main_hor), len(cy_main_hor)))
@ -29,7 +30,6 @@ def find_contours_mean_y_diff(contours_main):
def get_text_region_boxes_by_given_contours(contours): def get_text_region_boxes_by_given_contours(contours):
kernel = np.ones((5, 5), np.uint8) kernel = np.ones((5, 5), np.uint8)
boxes = [] boxes = []
contours_new = [] contours_new = []
@ -144,73 +144,11 @@ def return_contours_of_interested_region(region_pre_p, pixel, min_area=0.0002):
return contours_imgs return contours_imgs
def do_work_of_contours_in_image(queue_of_all_params, contours_per_process, indexes_r_con_per_pro, img, slope_first): def do_work_of_contours_in_image(contour, index_r_con, img, slope_first):
cnts_org_per_each_subprocess = []
index_by_text_region_contours = []
for mv in range(len(contours_per_process)):
index_by_text_region_contours.append(indexes_r_con_per_pro[mv])
img_copy = np.zeros(img.shape)
img_copy = cv2.fillPoly(img_copy, pts=[contours_per_process[mv]], color=(1, 1, 1))
img_copy = rotation_image_new(img_copy, -slope_first)
img_copy = img_copy.astype(np.uint8)
imgray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY)
ret, thresh = cv2.threshold(imgray, 0, 255, 0)
cont_int, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
cont_int[0][:, 0, 0] = cont_int[0][:, 0, 0] + np.abs(img_copy.shape[1] - img.shape[1])
cont_int[0][:, 0, 1] = cont_int[0][:, 0, 1] + np.abs(img_copy.shape[0] - img.shape[0])
cnts_org_per_each_subprocess.append(cont_int[0])
queue_of_all_params.put([ cnts_org_per_each_subprocess, index_by_text_region_contours])
def get_textregion_contours_in_org_image_multi(cnts, img, slope_first):
num_cores = cpu_count()
queue_of_all_params = Queue()
processes = []
nh = np.linspace(0, len(cnts), num_cores + 1)
indexes_by_text_con = np.array(range(len(cnts)))
for i in range(num_cores):
contours_per_process = cnts[int(nh[i]) : int(nh[i + 1])]
indexes_text_con_per_process = indexes_by_text_con[int(nh[i]) : int(nh[i + 1])]
processes.append(Process(target=do_work_of_contours_in_image, args=(queue_of_all_params, contours_per_process, indexes_text_con_per_process, img,slope_first )))
for i in range(num_cores):
processes[i].start()
cnts_org = []
all_index_text_con = []
for i in range(num_cores):
list_all_par = queue_of_all_params.get(True)
contours_for_sub_process = list_all_par[0]
indexes_for_sub_process = list_all_par[1]
for j in range(len(contours_for_sub_process)):
cnts_org.append(contours_for_sub_process[j])
all_index_text_con.append(indexes_for_sub_process[j])
for i in range(num_cores):
processes[i].join()
print(all_index_text_con)
return cnts_org
def loop_contour_image(index_l, cnts,img, slope_first):
img_copy = np.zeros(img.shape) img_copy = np.zeros(img.shape)
img_copy = cv2.fillPoly(img_copy, pts=[cnts[index_l]], color=(1, 1, 1)) img_copy = cv2.fillPoly(img_copy, pts=[contour], color=(1, 1, 1))
# plt.imshow(img_copy)
# plt.show()
# print(img.shape,'img')
img_copy = rotation_image_new(img_copy, -slope_first) img_copy = rotation_image_new(img_copy, -slope_first)
##print(img_copy.shape,'img_copy')
# plt.imshow(img_copy)
# plt.show()
img_copy = img_copy.astype(np.uint8) img_copy = img_copy.astype(np.uint8)
imgray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY) imgray = cv2.cvtColor(img_copy, cv2.COLOR_BGR2GRAY)
@ -220,17 +158,22 @@ def loop_contour_image(index_l, cnts,img, slope_first):
cont_int[0][:, 0, 0] = cont_int[0][:, 0, 0] + np.abs(img_copy.shape[1] - img.shape[1]) cont_int[0][:, 0, 0] = cont_int[0][:, 0, 0] + np.abs(img_copy.shape[1] - img.shape[1])
cont_int[0][:, 0, 1] = cont_int[0][:, 0, 1] + np.abs(img_copy.shape[0] - img.shape[0]) cont_int[0][:, 0, 1] = cont_int[0][:, 0, 1] + np.abs(img_copy.shape[0] - img.shape[0])
# print(np.shape(cont_int[0]))
return cont_int[0]
def get_textregion_contours_in_org_image_multi2(cnts, img, slope_first):
cnts_org = [] return cont_int[0], index_r_con
# print(cnts,'cnts')
with Pool(cpu_count()) as p:
cnts_org = p.starmap(loop_contour_image, [(index_l,cnts, img,slope_first) for index_l in range(len(cnts))])
def get_textregion_contours_in_org_image_multi(cnts, img, slope_first):
    """Run ``do_work_of_contours_in_image`` over all contours in a process pool.

    Each contour is paired with its position in ``cnts``; ``img`` and
    ``slope_first`` are bound as keyword arguments for every task.

    Returns:
        ``([], [])`` when ``cnts`` is empty; otherwise the per-task result
        tuples transposed with ``zip(*results)`` (each worker result is a
        ``(contour, index)`` pair, so this yields the contours and their
        indexes as two tuples, in input order).
    """
    if not len(cnts):
        return [], []

    worker = partial(do_work_of_contours_in_image,
                     img=img,
                     slope_first=slope_first)
    # (contour, index) pairs, unpacked positionally by starmap
    tasks = [(cnt, idx) for idx, cnt in enumerate(cnts)]
    with Pool(processes=cpu_count()) as pool:
        results = pool.starmap(worker, tasks)
    return tuple(zip(*results))
def get_textregion_contours_in_org_image(cnts, img, slope_first): def get_textregion_contours_in_org_image(cnts, img, slope_first):
@ -292,69 +235,40 @@ def get_textregion_contours_in_org_image_light_old(cnts, img, slope_first):
return cnts_org return cnts_org
def do_back_rotation_and_get_cnt_back(contour_par, index_r_con, img, slope_first):
    """Draw one contour, rotate it by ``-slope_first`` and re-detect it.

    The contour is filled onto a zero canvas shaped like ``img``, the canvas
    is passed through ``rotation_image_new`` and thresholded, and the first
    contour found by ``cv2.findContours`` is taken. Its coordinates are
    shifted by the absolute size difference between the rotated canvas and
    ``img``.

    Returns:
        ``(contour, index_r_con)``; ``index_r_con`` is passed through
        unchanged so the caller can keep track of the input order.
    """
    canvas = cv2.fillPoly(np.zeros(img.shape), pts=[contour_par], color=(1, 1, 1))
    rotated = rotation_image_new(canvas, -slope_first).astype(np.uint8)

    gray = cv2.cvtColor(rotated, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, 0)
    found, _ = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # NOTE(review): only the first detected contour is used; an empty
    # detection result raises IndexError here.
    first = found[0]
    first[:, 0, 0] = first[:, 0, 0] + np.abs(rotated.shape[1] - img.shape[1])
    first[:, 0, 1] = first[:, 0, 1] + np.abs(rotated.shape[0] - img.shape[0])
    return first, index_r_con
def get_textregion_contours_in_org_image_light(cnts, img, slope_first):
    """Back-rotate text region contours on a 6x downscaled copy of ``img``.

    ``img`` is resized to one sixth of its width and height, every contour is
    divided by 6, and ``do_back_rotation_and_get_cnt_back`` is applied to each
    contour in a process pool. The resulting contours are multiplied by 6
    again before being returned.

    Args:
        cnts: sequence of contour arrays (must support ``/`` and ``astype``).
        img: image array; only its shape and resized copy are used here.
        slope_first: angle forwarded to the worker.

    Returns:
        A list of contours in the same order as ``cnts`` (``Pool.starmap``
        preserves input order), or ``[]`` when ``cnts`` is empty.
    """
    if not len(cnts):
        return []

    img = cv2.resize(img, (int(img.shape[1] / 6), int(img.shape[0] / 6)),
                     interpolation=cv2.INTER_NEAREST)
    # np.int32 rather than the ``np.int`` alias: the alias was removed in
    # NumPy 1.24, and 32-bit integer points are what the OpenCV drawing
    # routines used by the worker accept.
    cnts = [(cnt / 6).astype(np.int32) for cnt in cnts]

    worker = partial(do_back_rotation_and_get_cnt_back,
                     img=img,
                     slope_first=slope_first)
    with Pool(processes=cpu_count()) as pool:
        results = pool.starmap(worker, zip(cnts, range(len(cnts))))

    # each result is (contour, index); the indexes are not needed because
    # the results already arrive in input order
    contours, _ = zip(*results)
    return [cnt * 6 for cnt in contours]
def return_contours_of_interested_textline(region_pre_p, pixel): def return_contours_of_interested_textline(region_pre_p, pixel):

@ -1,22 +1,23 @@
import os
from functools import partial from functools import partial
from multiprocessing import Pool, cpu_count
import numpy as np import numpy as np
import cv2 import cv2
from scipy.signal import find_peaks from scipy.signal import find_peaks
from scipy.ndimage import gaussian_filter1d from scipy.ndimage import gaussian_filter1d
import os
from multiprocessing import Process, Queue, cpu_count
from multiprocessing import Pool
from .rotate import rotate_image from .rotate import rotate_image
from .resize import resize_image
from .contour import ( from .contour import (
return_parent_contours, return_parent_contours,
filter_contours_area_of_image_tables, filter_contours_area_of_image_tables,
return_contours_of_image, return_contours_of_image,
filter_contours_area_of_image filter_contours_area_of_image,
return_contours_of_interested_textline,
find_contours_mean_y_diff,
) )
from .is_nan import isNaN
from . import ( from . import (
find_num_col_deskew, find_num_col_deskew,
isNaN, crop_image_inside_box,
) )
def dedup_separate_lines(img_patch, contour_text_interest, thetha, axis): def dedup_separate_lines(img_patch, contour_text_interest, thetha, axis):
@ -1249,13 +1250,13 @@ def separate_lines_new_inside_tiles(img_path, thetha):
forest.append(peaks_neg[i + 1]) forest.append(peaks_neg[i + 1])
if diff_peaks[i] > cut_off: if diff_peaks[i] > cut_off:
# print(forest[np.argmin(z[forest]) ] ) # print(forest[np.argmin(z[forest]) ] )
if not isNaN(forest[np.argmin(z[forest])]): if not np.isnan(forest[np.argmin(z[forest])]):
peaks_neg_true.append(forest[np.argmin(z[forest])]) peaks_neg_true.append(forest[np.argmin(z[forest])])
forest = [] forest = []
forest.append(peaks_neg[i + 1]) forest.append(peaks_neg[i + 1])
if i == (len(peaks_neg) - 1): if i == (len(peaks_neg) - 1):
# print(print(forest[np.argmin(z[forest]) ] )) # print(print(forest[np.argmin(z[forest]) ] ))
if not isNaN(forest[np.argmin(z[forest])]): if not np.isnan(forest[np.argmin(z[forest])]):
peaks_neg_true.append(forest[np.argmin(z[forest])]) peaks_neg_true.append(forest[np.argmin(z[forest])])
diff_peaks_pos = np.abs(np.diff(peaks)) diff_peaks_pos = np.abs(np.diff(peaks))
@ -1272,13 +1273,13 @@ def separate_lines_new_inside_tiles(img_path, thetha):
forest.append(peaks[i + 1]) forest.append(peaks[i + 1])
if diff_peaks_pos[i] > cut_off: if diff_peaks_pos[i] > cut_off:
# print(forest[np.argmin(z[forest]) ] ) # print(forest[np.argmin(z[forest]) ] )
if not isNaN(forest[np.argmax(z[forest])]): if not np.isnan(forest[np.argmax(z[forest])]):
peaks_pos_true.append(forest[np.argmax(z[forest])]) peaks_pos_true.append(forest[np.argmax(z[forest])])
forest = [] forest = []
forest.append(peaks[i + 1]) forest.append(peaks[i + 1])
if i == (len(peaks) - 1): if i == (len(peaks) - 1):
# print(print(forest[np.argmin(z[forest]) ] )) # print(print(forest[np.argmin(z[forest]) ] ))
if not isNaN(forest[np.argmax(z[forest])]): if not np.isnan(forest[np.argmax(z[forest])]):
peaks_pos_true.append(forest[np.argmax(z[forest])]) peaks_pos_true.append(forest[np.argmax(z[forest])])
# print(len(peaks_neg_true) ,len(peaks_pos_true) ,'lensss') # print(len(peaks_neg_true) ,len(peaks_pos_true) ,'lensss')
@ -1658,3 +1659,189 @@ def get_smallest_skew(img, sigma_des, angles, num_cores=1, plotter=None):
except: except:
angle = 0 angle = 0
return angle return angle
def do_work_of_slopes_new(
        box_text, contour, contour_par, index_r_con,
        textline_mask_tot_ea, image_page_rotated, slope_deskew,
        logger, MAX_SLOPE=999, KERNEL=None, plotter=None
):
    """Estimate the slope of one text region and extract its textline contours.

    Args:
        box_text: bounding box of the region as ``(x, y, w, h)``.
        contour: region contour used to mask the textline image for the
            slope estimate; returned unchanged.
        contour_par: region contour used to mask the textline image before
            contour post-processing; returned unchanged.
        index_r_con: opaque index, returned unchanged.
        textline_mask_tot_ea: textline mask of the whole page.
        image_page_rotated: page image, only passed to
            ``crop_image_inside_box`` to obtain the crop coordinates.
        slope_deskew: page-level slope used as fallback.
        logger: logger used for debug and error messages.
        MAX_SLOPE: sentinel marking "no usable slope found".
        KERNEL: erosion kernel; defaults to a 5x5 ``uint8`` matrix of ones.
        plotter: forwarded to ``return_deskew_slop``.

    Returns:
        ``(textline_contours, box_text, contour, contour_par, crop_coor,
        index_r_con, slope)``.
    """
    logger.debug('enter do_work_of_slopes_new')
    if KERNEL is None:
        KERNEL = np.ones((5, 5), np.uint8)

    x, y, w, h = box_text
    rows, cols = slice(y, y + h), slice(x, x + w)
    _, crop_coor = crop_image_inside_box(box_text, image_page_rotated)

    # textline pixels inside the region contour, cropped to the region box
    region_mask = cv2.fillPoly(np.zeros(textline_mask_tot_ea.shape), pts=[contour], color=(1, 1, 1))
    region_textlines = (textline_mask_tot_ea * region_mask)[rows, cols].astype(np.uint8)
    eroded = cv2.erode(region_textlines, KERNEL, iterations=2)

    if eroded.shape[0] / eroded.shape[1] < 0.1:
        # height/width ratio below 0.1: skip the estimate, report slope 0
        # and post-process the unmasked box with the page slope
        slope = 0
        boxed_textlines = textline_mask_tot_ea[rows, cols]
        cnt_clean_rot = textline_contours_postprocessing(boxed_textlines, slope_deskew, contour_par, box_text, 0)
        return cnt_clean_rot, box_text, contour, contour_par, crop_coor, index_r_con, slope

    try:
        textline_con, hierarchy = return_contours_of_image(eroded)
        textline_con_fil = filter_contours_area_of_image(eroded, textline_con, hierarchy, max_area=1, min_area=0.00008)
        y_diff_mean = find_contours_mean_y_diff(textline_con_fil)
        if np.isnan(y_diff_mean):
            slope_for_all = MAX_SLOPE
        else:
            sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0)))
            eroded[eroded > 0] = 1
            slope_for_all = return_deskew_slop(eroded, sigma_des, plotter=plotter)
            if abs(slope_for_all) <= 0.5:
                # near-zero estimates are replaced by the page slope
                slope_for_all = slope_deskew
    except Exception as why:
        logger.error(why)
        slope_for_all = MAX_SLOPE

    if slope_for_all == MAX_SLOPE:
        slope_for_all = slope_deskew
    slope = slope_for_all

    # keep only textline pixels inside the parent contour before post-processing
    parent_mask = cv2.fillPoly(np.zeros(textline_mask_tot_ea.shape), pts=[contour_par], color=(1, 1, 1))
    boxed_textlines = textline_mask_tot_ea[rows, cols].copy()
    boxed_textlines[parent_mask[rows, cols] == 0] = 0
    cnt_clean_rot = textline_contours_postprocessing(boxed_textlines, slope_for_all, contour_par, box_text)

    return cnt_clean_rot, box_text, contour, contour_par, crop_coor, index_r_con, slope
def do_work_of_slopes_new_curved(
        box_text, contour, contour_par, index_r_con,
        textline_mask_tot_ea, image_page_rotated, mask_texts_only, num_col, scale_par, slope_deskew,
        logger, MAX_SLOPE=999, KERNEL=None, plotter=None
):
    """Estimate the slope of one text region and extract (curved) textline contours.

    For slopes below 45 degrees in absolute value the textlines are separated
    with ``separate_lines_new2`` inside the bounding rectangle of
    ``contour_par``, each resulting line is dilated, rescaled by
    ``scale_par`` and re-detected. Otherwise the cropped region is handed to
    ``textline_contours_postprocessing``.

    Args:
        box_text: bounding box of the region as ``(x, y, w, h)``.
        contour: returned unchanged.
        contour_par: region contour used for masking; returned unchanged.
        index_r_con: opaque index, returned unchanged.
        textline_mask_tot_ea: textline mask of the whole page.
        image_page_rotated: page image, only passed to
            ``crop_image_inside_box`` to obtain the crop coordinates.
        mask_texts_only: only its shape is used (for scratch masks).
        num_col: forwarded to ``separate_lines_new2``; ``num_col + 1 == 1``
            selects 5 instead of 4 dilation iterations.
        scale_par: factor applied to both mask dimensions before the final
            contour detection.
        slope_deskew: page-level slope used as fallback.
        logger: logger used for debug and error messages.
        MAX_SLOPE: sentinel marking "no usable slope found".
        KERNEL: dilation kernel; defaults to a 5x5 ``uint8`` matrix of ones.
        plotter: forwarded to ``return_deskew_slop`` and ``separate_lines_new2``.

    Returns:
        ``(textline_contours_reversed, box_text, contour, contour_par,
        crop_coor, index_r_con, slope)``.
    """
    logger.debug("enter do_work_of_slopes_new_curved")
    if KERNEL is None:
        KERNEL = np.ones((5, 5), np.uint8)

    x, y, w, h = box_text
    # One buffer serves both the slope estimate and the >= 45 degree
    # fallback below, so the in-place binarisation is visible to both.
    region_patch = textline_mask_tot_ea[y: y + h, x: x + w].astype(np.uint8)

    if region_patch.shape[0] / region_patch.shape[1] < 0.1:
        # height/width ratio below 0.1: no estimate, report slope 0
        slope = 0
        slope_for_all = slope_deskew
    else:
        try:
            textline_con, hierarchy = return_contours_of_image(region_patch)
            textline_con_fil = filter_contours_area_of_image(region_patch, textline_con, hierarchy, max_area=1, min_area=0.0008)
            y_diff_mean = find_contours_mean_y_diff(textline_con_fil)
            if np.isnan(y_diff_mean):
                slope_for_all = MAX_SLOPE
            else:
                sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0)))
                region_patch[region_patch > 0] = 1
                slope_for_all = return_deskew_slop(region_patch, sigma_des, plotter=plotter)
                if abs(slope_for_all) < 0.5:
                    # near-zero estimates are replaced by the page slope
                    slope_for_all = slope_deskew
        except Exception as why:
            logger.error(why)
            slope_for_all = MAX_SLOPE

        if slope_for_all == MAX_SLOPE:
            slope_for_all = slope_deskew
        slope = slope_for_all

    _, crop_coor = crop_image_inside_box(box_text, image_page_rotated)

    if abs(slope_for_all) < 45:
        page_lines = np.zeros(textline_mask_tot_ea.shape)
        # from here on the window is the bounding rectangle of the parent
        # contour, not box_text
        x, y, w, h = cv2.boundingRect(contour_par)
        parent_mask = cv2.fillPoly(np.zeros(mask_texts_only.shape), pts=[contour_par], color=(1, 1, 1))
        parent_mask_patch = parent_mask[y: y + h, x: x + w]
        parent_textlines = parent_mask * textline_mask_tot_ea

        separated = separate_lines_new2(parent_textlines[y: y + h, x: x + w], 0, num_col, slope_for_all,
                                        plotter=plotter)
        # discard everything outside the parent contour
        separated[parent_mask_patch[:, :] != 1] = 0
        page_lines[y: y + h, x: x + w] = separated

        line_contours = return_contours_of_interested_textline(page_lines, 1)

        dilate_iterations = 5 if num_col + 1 == 1 else 4
        textlines_cnt_per_region = []
        for line_contour in line_contours:
            line_mask = cv2.fillPoly(np.zeros(mask_texts_only.shape), pts=[line_contour], color=(1, 1, 1))
            line_mask = cv2.dilate(line_mask, KERNEL, iterations=dilate_iterations)
            line_mask = resize_image(line_mask,
                                     int(line_mask.shape[0] * scale_par),
                                     int(line_mask.shape[1] * scale_par))
            rescaled_contours = return_contours_of_interested_textline(line_mask, 1)
            try:
                textlines_cnt_per_region.append(rescaled_contours[0])
            except Exception as why:
                # a line without a detectable contour is logged and dropped
                logger.error(why)
    else:
        textlines_cnt_per_region = textline_contours_postprocessing(region_patch, slope_for_all, contour_par, box_text, True)

    return textlines_cnt_per_region[::-1], box_text, contour, contour_par, crop_coor, index_r_con, slope
def do_work_of_slopes_new_light(
        box_text, contour, contour_par, index_r_con,
        textline_mask_tot_ea, image_page_rotated, slope_deskew,
        logger, textline_light=False
):
    """Extract the textline contours of one text region using the page slope.

    No per-region slope is estimated: ``slope_deskew`` is used for
    post-processing and is also what is reported back as the region's slope.

    This is a module-level function, so it has no ``self``. The mode switch
    that used to be read from ``self.textline_light`` is now the keyword
    argument ``textline_light``; callers that need the light textline mode
    must pass it explicitly (e.g. via ``functools.partial``).

    Args:
        box_text: bounding box of the region as ``(x, y, w, h)``.
        contour: returned unchanged.
        contour_par: region contour used to mask the textline image;
            returned unchanged.
        index_r_con: opaque index, returned unchanged.
        textline_mask_tot_ea: textline mask of the whole page.
        image_page_rotated: page image, only passed to
            ``crop_image_inside_box`` to obtain the crop coordinates.
        slope_deskew: page-level slope.
        logger: logger used for the debug message.
        textline_light: if true, detect contours directly on the full-page
            mask restricted to ``contour_par`` and filter them by area; if
            false, crop to ``box_text`` and run
            ``textline_contours_postprocessing``.

    Returns:
        ``(textline_contours, box_text, contour, contour_par, crop_coor,
        index_r_con, slope_deskew)``.
    """
    logger.debug('enter do_work_of_slopes_new_light')

    x, y, w, h = box_text
    _, crop_coor = crop_image_inside_box(box_text, image_page_rotated)

    # The former contour-masked crop of the textline image was overwritten
    # in both branches before being read, so it is no longer computed.
    mask_only_con_region = np.zeros(textline_mask_tot_ea.shape)
    mask_only_con_region = cv2.fillPoly(mask_only_con_region, pts=[contour_par], color=(1, 1, 1))

    if textline_light:
        all_text_region_raw = np.copy(textline_mask_tot_ea)
        all_text_region_raw[mask_only_con_region == 0] = 0
        cnt_clean_rot_raw, hir_on_cnt_clean_rot = return_contours_of_image(all_text_region_raw)
        cnt_clean_rot = filter_contours_area_of_image(all_text_region_raw, cnt_clean_rot_raw, hir_on_cnt_clean_rot,
                                                      max_area=1, min_area=0.00001)
    else:
        all_text_region_raw = np.copy(textline_mask_tot_ea[y: y + h, x: x + w])
        mask_only_con_region = mask_only_con_region[y: y + h, x: x + w]
        all_text_region_raw[mask_only_con_region == 0] = 0
        cnt_clean_rot = textline_contours_postprocessing(all_text_region_raw, slope_deskew, contour_par, box_text)

    # previously returned an undefined name ``slope``
    return cnt_clean_rot, box_text, contour, contour_par, crop_coor, index_r_con, slope_deskew

Loading…
Cancel
Save