|
|
@ -28,6 +28,7 @@ import tensorflow as tf
|
|
|
|
tf.get_logger().setLevel("ERROR")
|
|
|
|
tf.get_logger().setLevel("ERROR")
|
|
|
|
warnings.filterwarnings("ignore")
|
|
|
|
warnings.filterwarnings("ignore")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from .utils.contour import (
|
|
|
|
from .utils.contour import (
|
|
|
|
filter_contours_area_of_image,
|
|
|
|
filter_contours_area_of_image,
|
|
|
|
find_contours_mean_y_diff,
|
|
|
|
find_contours_mean_y_diff,
|
|
|
@ -110,7 +111,7 @@ class Eynollah:
|
|
|
|
dir_of_cropped_images=dir_of_cropped_images,
|
|
|
|
dir_of_cropped_images=dir_of_cropped_images,
|
|
|
|
dir_of_layout=dir_of_layout,
|
|
|
|
dir_of_layout=dir_of_layout,
|
|
|
|
image_filename=image_filename,
|
|
|
|
image_filename=image_filename,
|
|
|
|
image_filename_stem=image_filename_stem)
|
|
|
|
image_filename_stem=self.image_filename_stem)
|
|
|
|
self.writer = EynollahXmlWriter(
|
|
|
|
self.writer = EynollahXmlWriter(
|
|
|
|
dir_out=self.dir_out,
|
|
|
|
dir_out=self.dir_out,
|
|
|
|
image_filename=self.image_filename,
|
|
|
|
image_filename=self.image_filename,
|
|
|
@ -336,7 +337,10 @@ class Eynollah:
|
|
|
|
|
|
|
|
|
|
|
|
def resize_and_enhance_image_with_column_classifier(self):
|
|
|
|
def resize_and_enhance_image_with_column_classifier(self):
|
|
|
|
self.logger.debug("enter resize_and_enhance_image_with_column_classifier")
|
|
|
|
self.logger.debug("enter resize_and_enhance_image_with_column_classifier")
|
|
|
|
|
|
|
|
try:
|
|
|
|
dpi = check_dpi(self.image_filename)
|
|
|
|
dpi = check_dpi(self.image_filename)
|
|
|
|
|
|
|
|
except:
|
|
|
|
|
|
|
|
dpi = 230
|
|
|
|
self.logger.info("Detected %s DPI", dpi)
|
|
|
|
self.logger.info("Detected %s DPI", dpi)
|
|
|
|
img = self.imread()
|
|
|
|
img = self.imread()
|
|
|
|
|
|
|
|
|
|
|
@ -705,13 +709,12 @@ class Eynollah:
|
|
|
|
|
|
|
|
|
|
|
|
def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew):
|
|
|
|
def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew):
|
|
|
|
self.logger.debug("enter get_slopes_and_deskew_new")
|
|
|
|
self.logger.debug("enter get_slopes_and_deskew_new")
|
|
|
|
num_cores = cpu_count()
|
|
|
|
num_cores = 1#cpu_count()
|
|
|
|
queue_of_all_params = Queue()
|
|
|
|
queue_of_all_params = Queue()
|
|
|
|
|
|
|
|
|
|
|
|
processes = []
|
|
|
|
processes = []
|
|
|
|
nh = np.linspace(0, len(boxes), num_cores + 1)
|
|
|
|
nh = np.linspace(0, len(boxes), num_cores + 1)
|
|
|
|
indexes_by_text_con = np.array(range(len(contours_par)))
|
|
|
|
indexes_by_text_con = np.array(range(len(contours_par)))
|
|
|
|
|
|
|
|
|
|
|
|
for i in range(num_cores):
|
|
|
|
for i in range(num_cores):
|
|
|
|
boxes_per_process = boxes[int(nh[i]) : int(nh[i + 1])]
|
|
|
|
boxes_per_process = boxes[int(nh[i]) : int(nh[i + 1])]
|
|
|
|
contours_per_process = contours[int(nh[i]) : int(nh[i + 1])]
|
|
|
|
contours_per_process = contours[int(nh[i]) : int(nh[i + 1])]
|
|
|
@ -719,7 +722,6 @@ class Eynollah:
|
|
|
|
indexes_text_con_per_process = indexes_by_text_con[int(nh[i]) : int(nh[i + 1])]
|
|
|
|
indexes_text_con_per_process = indexes_by_text_con[int(nh[i]) : int(nh[i + 1])]
|
|
|
|
|
|
|
|
|
|
|
|
processes.append(Process(target=self.do_work_of_slopes_new, args=(queue_of_all_params, boxes_per_process, textline_mask_tot, contours_per_process, contours_par_per_process, indexes_text_con_per_process, image_page_rotated, slope_deskew)))
|
|
|
|
processes.append(Process(target=self.do_work_of_slopes_new, args=(queue_of_all_params, boxes_per_process, textline_mask_tot, contours_per_process, contours_par_per_process, indexes_text_con_per_process, image_page_rotated, slope_deskew)))
|
|
|
|
|
|
|
|
|
|
|
|
for i in range(num_cores):
|
|
|
|
for i in range(num_cores):
|
|
|
|
processes[i].start()
|
|
|
|
processes[i].start()
|
|
|
|
|
|
|
|
|
|
|
@ -730,7 +732,6 @@ class Eynollah:
|
|
|
|
boxes = []
|
|
|
|
boxes = []
|
|
|
|
all_box_coord = []
|
|
|
|
all_box_coord = []
|
|
|
|
all_index_text_con = []
|
|
|
|
all_index_text_con = []
|
|
|
|
|
|
|
|
|
|
|
|
for i in range(num_cores):
|
|
|
|
for i in range(num_cores):
|
|
|
|
list_all_par = queue_of_all_params.get(True)
|
|
|
|
list_all_par = queue_of_all_params.get(True)
|
|
|
|
slopes_for_sub_process = list_all_par[0]
|
|
|
|
slopes_for_sub_process = list_all_par[0]
|
|
|
@ -748,7 +749,6 @@ class Eynollah:
|
|
|
|
all_found_text_regions_par.append(contours_par_for_subprocess[j])
|
|
|
|
all_found_text_regions_par.append(contours_par_for_subprocess[j])
|
|
|
|
all_box_coord.append(boxes_coord_for_subprocess[j])
|
|
|
|
all_box_coord.append(boxes_coord_for_subprocess[j])
|
|
|
|
all_index_text_con.append(indexes_for_subprocess[j])
|
|
|
|
all_index_text_con.append(indexes_for_subprocess[j])
|
|
|
|
|
|
|
|
|
|
|
|
for i in range(num_cores):
|
|
|
|
for i in range(num_cores):
|
|
|
|
processes[i].join()
|
|
|
|
processes[i].join()
|
|
|
|
self.logger.debug('slopes %s', slopes)
|
|
|
|
self.logger.debug('slopes %s', slopes)
|
|
|
@ -918,7 +918,6 @@ class Eynollah:
|
|
|
|
|
|
|
|
|
|
|
|
def do_work_of_slopes_new(self, queue_of_all_params, boxes_text, textline_mask_tot_ea, contours_per_process, contours_par_per_process, indexes_r_con_per_pro, image_page_rotated, slope_deskew):
|
|
|
|
def do_work_of_slopes_new(self, queue_of_all_params, boxes_text, textline_mask_tot_ea, contours_per_process, contours_par_per_process, indexes_r_con_per_pro, image_page_rotated, slope_deskew):
|
|
|
|
self.logger.debug('enter do_work_of_slopes_new')
|
|
|
|
self.logger.debug('enter do_work_of_slopes_new')
|
|
|
|
|
|
|
|
|
|
|
|
slopes_per_each_subprocess = []
|
|
|
|
slopes_per_each_subprocess = []
|
|
|
|
bounding_box_of_textregion_per_each_subprocess = []
|
|
|
|
bounding_box_of_textregion_per_each_subprocess = []
|
|
|
|
textlines_rectangles_per_each_subprocess = []
|
|
|
|
textlines_rectangles_per_each_subprocess = []
|
|
|
@ -926,7 +925,6 @@ class Eynollah:
|
|
|
|
contours_textregion_par_per_each_subprocess = []
|
|
|
|
contours_textregion_par_per_each_subprocess = []
|
|
|
|
all_box_coord_per_process = []
|
|
|
|
all_box_coord_per_process = []
|
|
|
|
index_by_text_region_contours = []
|
|
|
|
index_by_text_region_contours = []
|
|
|
|
|
|
|
|
|
|
|
|
for mv in range(len(boxes_text)):
|
|
|
|
for mv in range(len(boxes_text)):
|
|
|
|
_, crop_coor = crop_image_inside_box(boxes_text[mv],image_page_rotated)
|
|
|
|
_, crop_coor = crop_image_inside_box(boxes_text[mv],image_page_rotated)
|
|
|
|
mask_textline = np.zeros((textline_mask_tot_ea.shape))
|
|
|
|
mask_textline = np.zeros((textline_mask_tot_ea.shape))
|
|
|
@ -959,7 +957,6 @@ class Eynollah:
|
|
|
|
except Exception as why:
|
|
|
|
except Exception as why:
|
|
|
|
self.logger.error(why)
|
|
|
|
self.logger.error(why)
|
|
|
|
slope_for_all = MAX_SLOPE
|
|
|
|
slope_for_all = MAX_SLOPE
|
|
|
|
|
|
|
|
|
|
|
|
if slope_for_all == MAX_SLOPE:
|
|
|
|
if slope_for_all == MAX_SLOPE:
|
|
|
|
slope_for_all = [slope_deskew][0]
|
|
|
|
slope_for_all = [slope_deskew][0]
|
|
|
|
slopes_per_each_subprocess.append(slope_for_all)
|
|
|
|
slopes_per_each_subprocess.append(slope_for_all)
|
|
|
@ -988,7 +985,6 @@ class Eynollah:
|
|
|
|
contours_textregion_per_each_subprocess.append(contours_per_process[mv])
|
|
|
|
contours_textregion_per_each_subprocess.append(contours_per_process[mv])
|
|
|
|
contours_textregion_par_per_each_subprocess.append(contours_par_per_process[mv])
|
|
|
|
contours_textregion_par_per_each_subprocess.append(contours_par_per_process[mv])
|
|
|
|
all_box_coord_per_process.append(crop_coor)
|
|
|
|
all_box_coord_per_process.append(crop_coor)
|
|
|
|
|
|
|
|
|
|
|
|
queue_of_all_params.put([slopes_per_each_subprocess, textlines_rectangles_per_each_subprocess, bounding_box_of_textregion_per_each_subprocess, contours_textregion_per_each_subprocess, contours_textregion_par_per_each_subprocess, all_box_coord_per_process, index_by_text_region_contours])
|
|
|
|
queue_of_all_params.put([slopes_per_each_subprocess, textlines_rectangles_per_each_subprocess, bounding_box_of_textregion_per_each_subprocess, contours_textregion_per_each_subprocess, contours_textregion_par_per_each_subprocess, all_box_coord_per_process, index_by_text_region_contours])
|
|
|
|
|
|
|
|
|
|
|
|
def textline_contours(self, img, patches, scaler_h, scaler_w):
|
|
|
|
def textline_contours(self, img, patches, scaler_h, scaler_w):
|
|
|
@ -1596,6 +1592,9 @@ class Eynollah:
|
|
|
|
|
|
|
|
|
|
|
|
t0 = time.time()
|
|
|
|
t0 = time.time()
|
|
|
|
img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement()
|
|
|
|
img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.logger.info("Enhancing took %ss ", str(time.time() - t0))
|
|
|
|
self.logger.info("Enhancing took %ss ", str(time.time() - t0))
|
|
|
|
|
|
|
|
|
|
|
|
t1 = time.time()
|
|
|
|
t1 = time.time()
|
|
|
@ -1636,8 +1635,6 @@ class Eynollah:
|
|
|
|
|
|
|
|
|
|
|
|
if self.full_layout:
|
|
|
|
if self.full_layout:
|
|
|
|
polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, regions_fully, regions_without_seperators = self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, img_only_regions)
|
|
|
|
polygons_of_images, img_revised_tab, text_regions_p_1_n, textline_mask_tot_d, regions_without_seperators_d, regions_fully, regions_without_seperators = self.run_boxes_full_layout(image_page, textline_mask_tot, text_regions_p, slope_deskew, num_col_classifier, img_only_regions)
|
|
|
|
# plt.imshow(img_revised_tab)
|
|
|
|
|
|
|
|
# plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
text_only = ((img_revised_tab[:, :] == 1)) * 1
|
|
|
|
text_only = ((img_revised_tab[:, :] == 1)) * 1
|
|
|
|
if np.abs(slope_deskew) >= SLOPE_THRESHOLD:
|
|
|
|
if np.abs(slope_deskew) >= SLOPE_THRESHOLD:
|
|
|
@ -1723,25 +1720,22 @@ class Eynollah:
|
|
|
|
self.logger.debug('areas_cnt_text_parent %s', areas_cnt_text_parent)
|
|
|
|
self.logger.debug('areas_cnt_text_parent %s', areas_cnt_text_parent)
|
|
|
|
# self.logger.debug('areas_cnt_text_parent_d %s', areas_cnt_text_parent_d)
|
|
|
|
# self.logger.debug('areas_cnt_text_parent_d %s', areas_cnt_text_parent_d)
|
|
|
|
# self.logger.debug('len(contours_only_text_parent) %s', len(contours_only_text_parent_d))
|
|
|
|
# self.logger.debug('len(contours_only_text_parent) %s', len(contours_only_text_parent_d))
|
|
|
|
|
|
|
|
|
|
|
|
txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first)
|
|
|
|
txt_con_org = get_textregion_contours_in_org_image(contours_only_text_parent, self.image, slope_first)
|
|
|
|
boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent)
|
|
|
|
boxes_text, _ = get_text_region_boxes_by_given_contours(contours_only_text_parent)
|
|
|
|
boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals)
|
|
|
|
boxes_marginals, _ = get_text_region_boxes_by_given_contours(polygons_of_marginals)
|
|
|
|
|
|
|
|
|
|
|
|
if not self.curved_line:
|
|
|
|
if not self.curved_line:
|
|
|
|
slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew)
|
|
|
|
slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew)
|
|
|
|
_, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew)
|
|
|
|
slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew)
|
|
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
|
|
|
|
|
|
|
|
scale_param = 1
|
|
|
|
scale_param = 1
|
|
|
|
all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew)
|
|
|
|
all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew)
|
|
|
|
all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier)
|
|
|
|
all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier)
|
|
|
|
all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, _ = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew)
|
|
|
|
all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew)
|
|
|
|
all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier)
|
|
|
|
all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier)
|
|
|
|
|
|
|
|
|
|
|
|
K.clear_session()
|
|
|
|
K.clear_session()
|
|
|
|
# print(index_by_text_par_con,'index_by_text_par_con')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if self.full_layout:
|
|
|
|
if self.full_layout:
|
|
|
|
if np.abs(slope_deskew) >= SLOPE_THRESHOLD:
|
|
|
|
if np.abs(slope_deskew) >= SLOPE_THRESHOLD:
|
|
|
|
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con])
|
|
|
|
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con])
|
|
|
@ -1809,7 +1803,7 @@ class Eynollah:
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d)
|
|
|
|
order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d)
|
|
|
|
|
|
|
|
|
|
|
|
pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page)
|
|
|
|
pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, polygons_of_tabels, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page)
|
|
|
|
self.logger.info("Job done in %ss", str(time.time() - t0))
|
|
|
|
self.logger.info("Job done in %ss", str(time.time() - t0))
|
|
|
|
return pcgts
|
|
|
|
return pcgts
|
|
|
|
else:
|
|
|
|
else:
|
|
|
@ -1819,6 +1813,6 @@ class Eynollah:
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con])
|
|
|
|
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con])
|
|
|
|
order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d)
|
|
|
|
order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d)
|
|
|
|
pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, cont_page)
|
|
|
|
pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page)
|
|
|
|
self.logger.info("Job done in %ss", str(time.time() - t0))
|
|
|
|
self.logger.info("Job done in %ss", str(time.time() - t0))
|
|
|
|
return pcgts
|
|
|
|
return pcgts
|
|
|
|