diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index ae292c6..8c92b92 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -13,7 +13,8 @@ import time import warnings from functools import partial from pathlib import Path -from multiprocessing import Pool, cpu_count +from multiprocessing import cpu_count +from concurrent.futures import ProcessPoolExecutor import gc from ocrd_utils import getLogger import cv2 @@ -251,6 +252,8 @@ class Eynollah: textline_light = self.textline_light, pcgts=pcgts) self.logger = logger if logger else getLogger('eynollah') + # for parallelization of CPU-intensive tasks: + self.executor = ProcessPoolExecutor(max_workers=cpu_count()) self.dir_models = dir_models self.model_dir_of_enhancement = dir_models + "/eynollah-enhancement_20210425" self.model_dir_of_binarization = dir_models + "/eynollah-binarization_20210425" @@ -1518,21 +1521,15 @@ class Eynollah: if not len(contours): return [], [], [], [], [], [], [] self.logger.debug("enter get_slopes_and_deskew_new_light") - if len(contours)>15: - num_cores = cpu_count() - else: - num_cores = 1 - with Pool(processes=num_cores) as pool: - results = pool.starmap( - partial(do_work_of_slopes_new_light, - textline_mask_tot_ea=textline_mask_tot, - image_page_rotated=image_page_rotated, - slope_deskew=slope_deskew, - logger=self.logger, - MAX_SLOPE=MAX_SLOPE, - KERNEL=KERNEL, - plotter=self.plotter,), - zip(boxes, contours, contours_par, range(len(contours_par)))) + results = self.executor.map(partial(do_work_of_slopes_new_light, + textline_mask_tot_ea=textline_mask_tot, + image_page_rotated=image_page_rotated, + slope_deskew=slope_deskew, + MAX_SLOPE=MAX_SLOPE, + KERNEL=KERNEL, + logger=self.logger, + plotter=self.plotter,), + boxes, contours, contours_par, range(len(contours_par))) #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new_light") return tuple(zip(*results)) @@ -1541,18 +1538,15 @@ class Eynollah: if not len(contours): return [], [], [], [], [], [], [] self.logger.debug("enter get_slopes_and_deskew_new") - num_cores = cpu_count() - with Pool(processes=num_cores) as pool: - results = pool.starmap( - partial(do_work_of_slopes_new, - textline_mask_tot_ea=textline_mask_tot, - image_page_rotated=image_page_rotated, - slope_deskew=slope_deskew, - logger=self.logger, - MAX_SLOPE=MAX_SLOPE, - KERNEL=KERNEL, - plotter=self.plotter,), - zip(boxes, contours, contours_par, range(len(contours_par)))) + results = self.executor.map(partial(do_work_of_slopes_new, + textline_mask_tot_ea=textline_mask_tot, + image_page_rotated=image_page_rotated, + slope_deskew=slope_deskew, + MAX_SLOPE=MAX_SLOPE, + KERNEL=KERNEL, + logger=self.logger, + plotter=self.plotter,), + boxes, contours, contours_par, range(len(contours_par))) #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new") return tuple(zip(*results)) @@ -1561,21 +1555,18 @@ class Eynollah: if not len(contours): return [], [], [], [], [], [], [] self.logger.debug("enter get_slopes_and_deskew_new_curved") - num_cores = cpu_count() - with Pool(processes=num_cores) as pool: - results = pool.starmap( - partial(do_work_of_slopes_new_curved, - textline_mask_tot_ea=textline_mask_tot, - image_page_rotated=image_page_rotated, - mask_texts_only=mask_texts_only, - num_col=num_col, - scale_par=scale_par, - slope_deskew=slope_deskew, - logger=self.logger, - MAX_SLOPE=MAX_SLOPE, - KERNEL=KERNEL, - plotter=self.plotter,), - zip(boxes, contours, contours_par, range(len(contours_par)))) + results = self.executor.map(partial(do_work_of_slopes_new_curved, + textline_mask_tot_ea=textline_mask_tot, + image_page_rotated=image_page_rotated, + mask_texts_only=mask_texts_only, + num_col=num_col, + scale_par=scale_par, + slope_deskew=slope_deskew, + MAX_SLOPE=MAX_SLOPE, + KERNEL=KERNEL, + logger=self.logger, + plotter=self.plotter,), + boxes, contours, contours_par, range(len(contours_par))) #textline_polygons, boxes, text_regions, text_regions_par, box_coord, index_text_con, slopes = zip(*results) self.logger.debug("exit get_slopes_and_deskew_new_curved") return tuple(zip(*results)) @@ -1643,7 +1634,8 @@ class Eynollah: y_diff_mean = find_contours_mean_y_diff(textline_con_fil) sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) crop_img[crop_img > 0] = 1 - slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, plotter=self.plotter) + slope_corresponding_textregion = return_deskew_slop(crop_img, sigma_des, + map=self.executor.map, logger=self.logger, plotter=self.plotter) except Exception as why: self.logger.error(why) slope_corresponding_textregion = MAX_SLOPE @@ -2932,10 +2924,8 @@ class Eynollah: def run_deskew(self, textline_mask_tot_ea): #print(textline_mask_tot_ea.shape, 'textline_mask_tot_ea deskew') - sigma = 2 - main_page_deskew = True - n_total_angles = 30 - slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), sigma, n_total_angles, main_page_deskew, plotter=self.plotter) + slope_deskew = return_deskew_slop(cv2.erode(textline_mask_tot_ea, KERNEL, iterations=2), 2, 30, True, + map=self.executor.map, logger=self.logger, plotter=self.plotter) slope_first = 0 if self.plotter: @@ -4748,7 +4738,7 @@ class Eynollah: contours_only_text_parent = self.dilate_textregions_contours(contours_only_text_parent) contours_only_text_parent = self.filter_contours_inside_a_bigger_one(contours_only_text_parent, text_only, marginal_cnts=polygons_of_marginals) #print("text region early 3.5 in %.1fs", time.time() - t0) - txt_con_org = get_textregion_contours_in_org_image_light(contours_only_text_parent, self.image, slope_first) + txt_con_org = get_textregion_contours_in_org_image_light(contours_only_text_parent, self.image, slope_first, map=self.executor.map) #txt_con_org = self.dilate_textregions_contours(txt_con_org) #contours_only_text_parent = self.dilate_textregions_contours(contours_only_text_parent) else: diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index 65331c2..e47c5e7 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -1,5 +1,4 @@ from functools import partial -from multiprocessing import cpu_count, Pool import cv2 import numpy as np from shapely import geometry @@ -162,17 +161,14 @@ def do_work_of_contours_in_image(contour, index_r_con, img, slope_first): return cont_int[0], index_r_con -def get_textregion_contours_in_org_image_multi(cnts, img, slope_first): +def get_textregion_contours_in_org_image_multi(cnts, img, slope_first, map=map): if not len(cnts): return [], [] - num_cores = cpu_count() - with Pool(processes=num_cores) as pool: - results = pool.starmap( - partial(do_work_of_contours_in_image, - img=img, - slope_first=slope_first, - ), - zip(cnts, range(len(cnts)))) + results = map(partial(do_work_of_contours_in_image, + img=img, + slope_first=slope_first, + ), + cnts, range(len(cnts))) return tuple(zip(*results)) def get_textregion_contours_in_org_image(cnts, img, slope_first): @@ -252,21 +248,18 @@ def do_back_rotation_and_get_cnt_back(contour_par, index_r_con, img, slope_first # print(np.shape(cont_int[0])) return cont_int[0], index_r_con -def get_textregion_contours_in_org_image_light(cnts, img, slope_first): +def get_textregion_contours_in_org_image_light(cnts, img, slope_first, map=map): if not len(cnts): return [] img = cv2.resize(img, (int(img.shape[1]/6), int(img.shape[0]/6)), interpolation=cv2.INTER_NEAREST) ##cnts = list( (np.array(cnts)/2).astype(np.int16) ) #cnts = cnts/2 cnts = [(i/6).astype(np.int) for i in cnts] - num_cores = cpu_count() - with Pool(processes=num_cores) as pool: - results = pool.starmap( - partial(do_back_rotation_and_get_cnt_back, - img=img, - slope_first=slope_first, - ), - zip(cnts, range(len(cnts)))) + results = map(partial(do_back_rotation_and_get_cnt_back, + img=img, + slope_first=slope_first, + ), + cnts, range(len(cnts))) contours, indexes = tuple(zip(*results)) return [i*6 for i in contours] diff --git a/src/eynollah/utils/separate_lines.py b/src/eynollah/utils/separate_lines.py index 922fa14..48e1c5b 100644 --- a/src/eynollah/utils/separate_lines.py +++ b/src/eynollah/utils/separate_lines.py @@ -1,6 +1,6 @@ import os +from logging import getLogger from functools import partial -from multiprocessing import Pool, cpu_count import numpy as np import cv2 from scipy.signal import find_peaks @@ -1464,7 +1464,9 @@ def textline_contours_postprocessing(textline_mask, slope, contour_text_interest return contours_rotated_clean -def separate_lines_new2(img_path, thetha, num_col, slope_region, plotter=None): +def separate_lines_new2(img_path, thetha, num_col, slope_region, logger=None, plotter=None): + if logger is None: + logger = getLogger(__package__) if num_col == 1: num_patches = int(img_path.shape[1] / 200.0) @@ -1572,18 +1574,20 @@ def separate_lines_new2(img_path, thetha, num_col, slope_region, plotter=None): # plt.show() return img_patch_ineterst_revised -def do_image_rotation(angle, img, sigma_des): - print(f"rotating image by {angle}") +def do_image_rotation(angle, img, sigma_des, logger=None): + if logger is None: + logger = getLogger(__package__) img_rot = rotate_image(img, angle) img_rot[img_rot!=0] = 1 try: var = find_num_col_deskew(img_rot, sigma_des, 20.3) except: + logger.exception("cannot determine variance for angle %.2f°", angle) var = 0 return var -def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, main_page=False, plotter=None): - num_cores = cpu_count() +def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, + main_page=False, logger=None, plotter=None, map=map): if main_page and plotter: plotter.save_plot_of_textline_density(img_patch_org) @@ -1615,16 +1619,16 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, main_page=Fals #plt.imshow(img_resized) #plt.show() angles = np.array([-45, 0, 45, 90,]) - angle = get_smallest_skew(img_resized, sigma_des, angles, num_cores=num_cores, plotter=plotter) + angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) angles = np.linspace(angle - 22.5, angle + 22.5, n_tot_angles) - angle = get_smallest_skew(img_resized, sigma_des, angles, num_cores=num_cores, plotter=plotter) + angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) elif main_page: #plt.imshow(img_resized) #plt.show() angles = np.linspace(-12, 12, n_tot_angles)#np.array([0 , 45 , 90 , -45]) - angle = get_smallest_skew(img_resized, sigma_des, angles, num_cores=num_cores, plotter=plotter) + angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) early_slope_edge=11 if abs(angle) > early_slope_edge: @@ -1632,11 +1636,11 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, main_page=Fals angles = np.linspace(-90, -12, n_tot_angles) else: angles = np.linspace(90, 12, n_tot_angles) - angle = get_smallest_skew(img_resized, sigma_des, angles, num_cores=num_cores, plotter=plotter) + angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) else: angles = np.linspace(-25, 25, int(0.5 * n_tot_angles) + 10) - angle = get_smallest_skew(img_resized, sigma_des, angles, num_cores=num_cores, plotter=plotter) + angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) early_slope_edge=22 if abs(angle) > early_slope_edge: @@ -1644,30 +1648,35 @@ def return_deskew_slop(img_patch_org, sigma_des,n_tot_angles=100, main_page=Fals angles = np.linspace(-90, -25, int(0.5 * n_tot_angles) + 10) else: angles = np.linspace(90, 25, int(0.5 * n_tot_angles) + 10) - angle = get_smallest_skew(img_resized, sigma_des, angles, num_cores=num_cores, plotter=plotter) + angle = get_smallest_skew(img_resized, sigma_des, angles, map=map, logger=logger, plotter=plotter) return angle -def get_smallest_skew(img, sigma_des, angles, num_cores=1, plotter=None): - with Pool(processes=num_cores) as pool: - results = pool.map(partial(do_image_rotation, img=img, sigma_des=sigma_des), angles) +def get_smallest_skew(img, sigma_des, angles, logger=None, plotter=None, map=map): + if logger is None: + logger = getLogger(__package__) + results = list(map(partial(do_image_rotation, img=img, sigma_des=sigma_des, logger=logger), angles)) if plotter: plotter.save_plot_of_rotation_angle(angles, results) try: var_res = np.array(results) + assert var_res.any() angle = angles[np.argmax(var_res)] except: + logger.exception("cannot determine best angle among %s", str(angles)) angle = 0 return angle def do_work_of_slopes_new( box_text, contour, contour_par, index_r_con, textline_mask_tot_ea, image_page_rotated, slope_deskew, - logger, MAX_SLOPE=999, KERNEL=None, plotter=None + logger=None, MAX_SLOPE=999, KERNEL=None, plotter=None ): - logger.debug('enter do_work_of_slopes_new') if KERNEL is None: KERNEL = np.ones((5, 5), np.uint8) + if logger is None: + logger = getLogger(__package__) + logger.debug('enter do_work_of_slopes_new') x, y, w, h = box_text _, crop_coor = crop_image_inside_box(box_text, image_page_rotated) @@ -1693,11 +1702,11 @@ def do_work_of_slopes_new( else: sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) img_int_p[img_int_p > 0] = 1 - slope_for_all = return_deskew_slop(img_int_p, sigma_des, plotter=plotter) + slope_for_all = return_deskew_slop(img_int_p, sigma_des, logger=logger, plotter=plotter) if abs(slope_for_all) <= 0.5: slope_for_all = slope_deskew - except Exception as why: - logger.error(why) + except: + logger.exception("cannot determine angle of contours") slope_for_all = MAX_SLOPE if slope_for_all == MAX_SLOPE: @@ -1728,11 +1737,13 @@ def do_work_of_slopes_new( def do_work_of_slopes_new_curved( box_text, contour, contour_par, index_r_con, textline_mask_tot_ea, image_page_rotated, mask_texts_only, num_col, scale_par, slope_deskew, - logger, MAX_SLOPE=999, KERNEL=None, plotter=None + logger=None, MAX_SLOPE=999, KERNEL=None, plotter=None ): - logger.debug("enter do_work_of_slopes_new_curved") if KERNEL is None: KERNEL = np.ones((5, 5), np.uint8) + if logger is None: + logger = getLogger(__package__) + logger.debug("enter do_work_of_slopes_new_curved") x, y, w, h = box_text all_text_region_raw = textline_mask_tot_ea[y: y + h, x: x + w].astype(np.uint8) @@ -1755,11 +1766,11 @@ def do_work_of_slopes_new_curved( else: sigma_des = max(1, int(y_diff_mean * (4.0 / 40.0))) img_int_p[img_int_p > 0] = 1 - slope_for_all = return_deskew_slop(img_int_p, sigma_des, plotter=plotter) + slope_for_all = return_deskew_slop(img_int_p, sigma_des, logger=logger, plotter=plotter) if abs(slope_for_all) < 0.5: slope_for_all = slope_deskew - except Exception as why: - logger.error(why) + except: + logger.exception("cannot determine angle of contours") slope_for_all = MAX_SLOPE if slope_for_all == MAX_SLOPE: @@ -1778,7 +1789,7 @@ def do_work_of_slopes_new_curved( # print(slope_for_all,'slope_for_all') textline_rotated_separated = separate_lines_new2(textline_biggest_region[y: y+h, x: x+w], 0, num_col, slope_for_all, - plotter=plotter) + logger=logger, plotter=plotter) # new line added ##print(np.shape(textline_rotated_separated),np.shape(mask_biggest)) @@ -1818,8 +1829,10 @@ def do_work_of_slopes_new_curved( def do_work_of_slopes_new_light( box_text, contour, contour_par, index_r_con, textline_mask_tot_ea, image_page_rotated, slope_deskew, - logger + logger=None ): + if logger is None: + logger = getLogger(__package__) logger.debug('enter do_work_of_slopes_new_light') x, y, w, h = box_text