Merge pull request #116 from qurator-spk/fix-typos

fix some typos
pull/117/head
vahidrezanezhad 1 year ago committed by GitHub
commit 0ea90b7509
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -63,7 +63,7 @@ The following options can be used to further configure the processing:
| `-tab` | apply table detection | | `-tab` | apply table detection |
| `-ae` | apply enhancement (the resulting image is saved to the output directory) | | `-ae` | apply enhancement (the resulting image is saved to the output directory) |
| `-as` | apply scaling | | `-as` | apply scaling |
| `-cl` | apply countour detection for curved text lines instead of bounding boxes | | `-cl` | apply contour detection for curved text lines instead of bounding boxes |
| `-ib` | apply binarization (the resulting image is saved to the output directory) | | `-ib` | apply binarization (the resulting image is saved to the output directory) |
| `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) | | `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) |
| `-ho` | ignore headers for reading order detection | | `-ho` | ignore headers for reading order detection |
@ -99,7 +99,7 @@ uses the original (RGB) image despite any binarization that may have occurred in
If you find this tool useful in your work, please consider citing our paper: If you find this tool useful in your work, please consider citing our paper:
```bibtex ```bibtex
@inproceedings{rezanezhad2023documentlayoutanalysis, @inproceedings{rezanezhad2023eynollah,
title = {Document Layout Analysis with Deep Learning and Heuristics}, title = {Document Layout Analysis with Deep Learning and Heuristics},
author = {Rezanezhad, Vahid and Baierer, Konstantin and Gerber, Mike and Labusch, Kai and Neudecker, Clemens}, author = {Rezanezhad, Vahid and Baierer, Konstantin and Gerber, Mike and Labusch, Kai and Neudecker, Clemens},
booktitle = {Proceedings of the 7th International Workshop on Historical Document Imaging and Processing {HIP} 2023, booktitle = {Proceedings of the 7th International Workshop on Historical Document Imaging and Processing {HIP} 2023,

@ -1159,7 +1159,7 @@ class Eynollah:
processes[i].start() processes[i].start()
slopes = [] slopes = []
all_found_texline_polygons = [] all_found_textline_polygons = []
all_found_text_regions = [] all_found_text_regions = []
all_found_text_regions_par = [] all_found_text_regions_par = []
boxes = [] boxes = []
@ -1176,7 +1176,7 @@ class Eynollah:
indexes_for_subprocess = list_all_par[6] indexes_for_subprocess = list_all_par[6]
for j in range(len(slopes_for_sub_process)): for j in range(len(slopes_for_sub_process)):
slopes.append(slopes_for_sub_process[j]) slopes.append(slopes_for_sub_process[j])
all_found_texline_polygons.append(polys_for_sub_process[j]) all_found_textline_polygons.append(polys_for_sub_process[j])
boxes.append(boxes_for_sub_process[j]) boxes.append(boxes_for_sub_process[j])
all_found_text_regions.append(contours_for_subprocess[j]) all_found_text_regions.append(contours_for_subprocess[j])
all_found_text_regions_par.append(contours_par_for_subprocess[j]) all_found_text_regions_par.append(contours_par_for_subprocess[j])
@ -1186,7 +1186,7 @@ class Eynollah:
processes[i].join() processes[i].join()
self.logger.debug('slopes %s', slopes) self.logger.debug('slopes %s', slopes)
self.logger.debug("exit get_slopes_and_deskew_new") self.logger.debug("exit get_slopes_and_deskew_new")
return slopes, all_found_texline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con return slopes, all_found_textline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con
def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew): def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew):
self.logger.debug("enter get_slopes_and_deskew_new") self.logger.debug("enter get_slopes_and_deskew_new")
@ -1207,7 +1207,7 @@ class Eynollah:
processes[i].start() processes[i].start()
slopes = [] slopes = []
all_found_texline_polygons = [] all_found_textline_polygons = []
all_found_text_regions = [] all_found_text_regions = []
all_found_text_regions_par = [] all_found_text_regions_par = []
boxes = [] boxes = []
@ -1224,7 +1224,7 @@ class Eynollah:
indexes_for_subprocess = list_all_par[6] indexes_for_subprocess = list_all_par[6]
for j in range(len(slopes_for_sub_process)): for j in range(len(slopes_for_sub_process)):
slopes.append(slopes_for_sub_process[j]) slopes.append(slopes_for_sub_process[j])
all_found_texline_polygons.append(polys_for_sub_process[j]) all_found_textline_polygons.append(polys_for_sub_process[j])
boxes.append(boxes_for_sub_process[j]) boxes.append(boxes_for_sub_process[j])
all_found_text_regions.append(contours_for_subprocess[j]) all_found_text_regions.append(contours_for_subprocess[j])
all_found_text_regions_par.append(contours_par_for_subprocess[j]) all_found_text_regions_par.append(contours_par_for_subprocess[j])
@ -1234,7 +1234,7 @@ class Eynollah:
processes[i].join() processes[i].join()
self.logger.debug('slopes %s', slopes) self.logger.debug('slopes %s', slopes)
self.logger.debug("exit get_slopes_and_deskew_new") self.logger.debug("exit get_slopes_and_deskew_new")
return slopes, all_found_texline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con return slopes, all_found_textline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con
def get_slopes_and_deskew_new_curved(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, mask_texts_only, num_col, scale_par, slope_deskew): def get_slopes_and_deskew_new_curved(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, mask_texts_only, num_col, scale_par, slope_deskew):
self.logger.debug("enter get_slopes_and_deskew_new_curved") self.logger.debug("enter get_slopes_and_deskew_new_curved")
@ -1257,7 +1257,7 @@ class Eynollah:
processes[i].start() processes[i].start()
slopes = [] slopes = []
all_found_texline_polygons = [] all_found_textline_polygons = []
all_found_text_regions = [] all_found_text_regions = []
all_found_text_regions_par = [] all_found_text_regions_par = []
boxes = [] boxes = []
@ -1275,7 +1275,7 @@ class Eynollah:
slopes_for_sub_process = list_all_par[6] slopes_for_sub_process = list_all_par[6]
for j in range(len(polys_for_sub_process)): for j in range(len(polys_for_sub_process)):
slopes.append(slopes_for_sub_process[j]) slopes.append(slopes_for_sub_process[j])
all_found_texline_polygons.append(polys_for_sub_process[j][::-1]) all_found_textline_polygons.append(polys_for_sub_process[j][::-1])
boxes.append(boxes_for_sub_process[j]) boxes.append(boxes_for_sub_process[j])
all_found_text_regions.append(contours_for_subprocess[j]) all_found_text_regions.append(contours_for_subprocess[j])
all_found_text_regions_par.append(contours_par_for_subprocess[j]) all_found_text_regions_par.append(contours_par_for_subprocess[j])
@ -1285,7 +1285,7 @@ class Eynollah:
for i in range(num_cores): for i in range(num_cores):
processes[i].join() processes[i].join()
# print(slopes,'slopes') # print(slopes,'slopes')
return all_found_texline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con, slopes return all_found_textline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con, slopes
def do_work_of_slopes_new_curved(self, queue_of_all_params, boxes_text, textline_mask_tot_ea, contours_per_process, contours_par_per_process, image_page_rotated, mask_texts_only, num_col, scale_par, indexes_r_con_per_pro, slope_deskew): def do_work_of_slopes_new_curved(self, queue_of_all_params, boxes_text, textline_mask_tot_ea, contours_per_process, contours_par_per_process, image_page_rotated, mask_texts_only, num_col, scale_par, indexes_r_con_per_pro, slope_deskew):
self.logger.debug("enter do_work_of_slopes_new_curved") self.logger.debug("enter do_work_of_slopes_new_curved")
@ -3007,37 +3007,37 @@ class Eynollah:
if not self.curved_line: if not self.curved_line:
if self.light_version: if self.light_version:
if self.textline_light: if self.textline_light:
slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, image_page_rotated, boxes_text, slope_deskew) slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, image_page_rotated, boxes_text, slope_deskew)
slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, image_page_rotated, boxes_marginals, slope_deskew) slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, image_page_rotated, boxes_marginals, slope_deskew)
else: else:
slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew)
slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew)
else: else:
slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew)
slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew)
else: else:
scale_param = 1 scale_param = 1
all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew)
all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) all_found_textline_polygons = small_textlines_to_parent_adherence2(all_found_textline_polygons, textline_mask_tot_ea, num_col_classifier)
all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew)
all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier)
if self.full_layout: if self.full_layout:
if np.abs(slope_deskew) >= SLOPE_THRESHOLD: if np.abs(slope_deskew) >= SLOPE_THRESHOLD:
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con])
if self.light_version: if self.light_version:
text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered)
else: else:
text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered)
else: else:
#takes long time #takes long time
contours_only_text_parent_d_ordered = None contours_only_text_parent_d_ordered = None
if self.light_version: if self.light_version:
text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered)
else: else:
text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered)
if self.plotter: if self.plotter:
self.plotter.save_plot_of_layout(text_regions_p, image_page) self.plotter.save_plot_of_layout(text_regions_p, image_page)
@ -3045,7 +3045,7 @@ class Eynollah:
pixel_img = 4 pixel_img = 4
polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img)
all_found_texline_polygons = adhere_drop_capital_region_into_corresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, kernel=KERNEL, curved_line=self.curved_line) all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, kernel=KERNEL, curved_line=self.curved_line)
pixel_lines = 6 pixel_lines = 6
@ -3091,7 +3091,7 @@ class Eynollah:
else: else:
order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d)
pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml) pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml)
self.logger.info("Job done in %.1fs", time.time() - t0) self.logger.info("Job done in %.1fs", time.time() - t0)
##return pcgts ##return pcgts
else: else:
@ -3101,7 +3101,7 @@ class Eynollah:
else: else:
contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con])
order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d)
pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables) pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables)
self.logger.info("Job done in %.1fs", time.time() - t0) self.logger.info("Job done in %.1fs", time.time() - t0)
##return pcgts ##return pcgts
self.writer.write_pagexml(pcgts) self.writer.write_pagexml(pcgts)

@ -796,7 +796,7 @@ def putt_bb_of_drop_capitals_of_model_in_patches_in_layout(layout_in_patch):
return layout_in_patch return layout_in_patch
def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_texline_polygons,slopes,contours_only_text_parent_d_ordered): def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_textline_polygons,slopes,contours_only_text_parent_d_ordered):
cx_main,cy_main ,x_min_main , x_max_main, y_min_main ,y_max_main,y_corr_x_min_from_argmin=find_new_features_of_contours(contours_only_text_parent) cx_main,cy_main ,x_min_main , x_max_main, y_min_main ,y_max_main,y_corr_x_min_from_argmin=find_new_features_of_contours(contours_only_text_parent)
@ -805,8 +805,8 @@ def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions
all_found_texline_polygons_main=[] all_found_textline_polygons_main=[]
all_found_texline_polygons_head=[] all_found_textline_polygons_head=[]
all_box_coord_main=[] all_box_coord_main=[]
all_box_coord_head=[] all_box_coord_head=[]
@ -840,7 +840,7 @@ def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions
contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii])
all_box_coord_head.append(all_box_coord[ii]) all_box_coord_head.append(all_box_coord[ii])
slopes_head.append(slopes[ii]) slopes_head.append(slopes[ii])
all_found_texline_polygons_head.append(all_found_texline_polygons[ii]) all_found_textline_polygons_head.append(all_found_textline_polygons[ii])
else: else:
regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=1 regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=1
contours_only_text_parent_main.append(con) contours_only_text_parent_main.append(con)
@ -848,14 +848,14 @@ def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions
contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii]) contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii])
all_box_coord_main.append(all_box_coord[ii]) all_box_coord_main.append(all_box_coord[ii])
slopes_main.append(slopes[ii]) slopes_main.append(slopes[ii])
all_found_texline_polygons_main.append(all_found_texline_polygons[ii]) all_found_textline_polygons_main.append(all_found_textline_polygons[ii])
#print(all_pixels,pixels_main,pixels_header) #print(all_pixels,pixels_main,pixels_header)
return regions_model_1,contours_only_text_parent_main,contours_only_text_parent_head,all_box_coord_main,all_box_coord_head,all_found_texline_polygons_main,all_found_texline_polygons_head,slopes_main,slopes_head,contours_only_text_parent_main_d,contours_only_text_parent_head_d return regions_model_1,contours_only_text_parent_main,contours_only_text_parent_head,all_box_coord_main,all_box_coord_head,all_found_textline_polygons_main,all_found_textline_polygons_head,slopes_main,slopes_head,contours_only_text_parent_main_d,contours_only_text_parent_head_d
def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_texline_polygons,slopes,contours_only_text_parent_d_ordered): def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_textline_polygons,slopes,contours_only_text_parent_d_ordered):
### to make it faster ### to make it faster
h_o = regions_model_1.shape[0] h_o = regions_model_1.shape[0]
@ -874,8 +874,8 @@ def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,r
all_found_texline_polygons_main=[] all_found_textline_polygons_main=[]
all_found_texline_polygons_head=[] all_found_textline_polygons_head=[]
all_box_coord_main=[] all_box_coord_main=[]
all_box_coord_head=[] all_box_coord_head=[]
@ -909,7 +909,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,r
contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii])
all_box_coord_head.append(all_box_coord[ii]) all_box_coord_head.append(all_box_coord[ii])
slopes_head.append(slopes[ii]) slopes_head.append(slopes[ii])
all_found_texline_polygons_head.append(all_found_texline_polygons[ii]) all_found_textline_polygons_head.append(all_found_textline_polygons[ii])
else: else:
regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=1 regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=1
contours_only_text_parent_main.append(con) contours_only_text_parent_main.append(con)
@ -917,7 +917,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,r
contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii]) contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii])
all_box_coord_main.append(all_box_coord[ii]) all_box_coord_main.append(all_box_coord[ii])
slopes_main.append(slopes[ii]) slopes_main.append(slopes[ii])
all_found_texline_polygons_main.append(all_found_texline_polygons[ii]) all_found_textline_polygons_main.append(all_found_textline_polygons[ii])
#print(all_pixels,pixels_main,pixels_header) #print(all_pixels,pixels_main,pixels_header)
@ -931,7 +931,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,r
contours_only_text_parent_main = [ (i*3.).astype(np.int32) for i in contours_only_text_parent_main] contours_only_text_parent_main = [ (i*3.).astype(np.int32) for i in contours_only_text_parent_main]
### ###
return regions_model_1,contours_only_text_parent_main,contours_only_text_parent_head,all_box_coord_main,all_box_coord_head,all_found_texline_polygons_main,all_found_texline_polygons_head,slopes_main,slopes_head,contours_only_text_parent_main_d,contours_only_text_parent_head_d return regions_model_1,contours_only_text_parent_main,contours_only_text_parent_head,all_box_coord_main,all_box_coord_head,all_found_textline_polygons_main,all_found_textline_polygons_head,slopes_main,slopes_head,contours_only_text_parent_main_d,contours_only_text_parent_head_d
def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col): def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col):
# print(textlines_con) # print(textlines_con)

@ -7,13 +7,13 @@ class EynollahIdCounter():
def __init__(self, region_idx=0, line_idx=0): def __init__(self, region_idx=0, line_idx=0):
self._counter = Counter() self._counter = Counter()
self._inital_region_idx = region_idx self._initial_region_idx = region_idx
self._inital_line_idx = line_idx self._initial_line_idx = line_idx
self.reset() self.reset()
def reset(self): def reset(self):
self.set('region', self._inital_region_idx) self.set('region', self._initial_region_idx)
self.set('line', self._inital_line_idx) self.set('line', self._initial_line_idx)
def inc(self, name, val=1): def inc(self, name, val=1):
self._counter.update({name: val}) self._counter.update({name: val})

@ -13,13 +13,13 @@ def adhere_drop_capital_region_into_corresponding_textline(
contours_only_text_parent_h, contours_only_text_parent_h,
all_box_coord, all_box_coord,
all_box_coord_h, all_box_coord_h,
all_found_texline_polygons, all_found_textline_polygons,
all_found_texline_polygons_h, all_found_textline_polygons_h,
kernel=None, kernel=None,
curved_line=False, curved_line=False,
): ):
# print(np.shape(all_found_texline_polygons),np.shape(all_found_texline_polygons[3]),'all_found_texline_polygonsshape') # print(np.shape(all_found_textline_polygons),np.shape(all_found_textline_polygons[3]),'all_found_textline_polygonsshape')
# print(all_found_texline_polygons[3]) # print(all_found_textline_polygons[3])
cx_m, cy_m, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) cx_m, cy_m, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent)
cx_h, cy_h, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_h) cx_h, cy_h, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_h)
cx_d, cy_d, _, _, y_min_d, y_max_d, _ = find_new_features_of_contours(polygons_of_drop_capitals) cx_d, cy_d, _, _, y_min_d, y_max_d, _ = find_new_features_of_contours(polygons_of_drop_capitals)
@ -87,9 +87,9 @@ def adhere_drop_capital_region_into_corresponding_textline(
region_final = region_with_intersected_drop[np.argmax(sum_pixels_of_intersection)] - 1 region_final = region_with_intersected_drop[np.argmax(sum_pixels_of_intersection)] - 1
# print(region_final,'region_final') # print(region_final,'region_final')
# cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)])
try: try:
cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)])
# print(all_box_coord[j_cont]) # print(all_box_coord[j_cont])
# print(cx_t) # print(cx_t)
# print(cy_t) # print(cy_t)
@ -105,9 +105,9 @@ def adhere_drop_capital_region_into_corresponding_textline(
arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop])) arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop]))
# print(arg_min) # print(arg_min)
cnt_nearest = np.copy(all_found_texline_polygons[int(region_final)][arg_min]) cnt_nearest = np.copy(all_found_textline_polygons[int(region_final)][arg_min])
cnt_nearest[:, 0, 0] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2] cnt_nearest[:, 0, 0] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2]
cnt_nearest[:, 0, 1] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0] cnt_nearest[:, 0, 1] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0]
img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3))
img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255)) img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255))
@ -131,7 +131,7 @@ def adhere_drop_capital_region_into_corresponding_textline(
# contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2]) # contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2])
all_found_texline_polygons[int(region_final)][arg_min] = contours_biggest all_found_textline_polygons[int(region_final)][arg_min] = contours_biggest
except: except:
# print('gordun1') # print('gordun1')
@ -139,11 +139,11 @@ def adhere_drop_capital_region_into_corresponding_textline(
elif len(region_with_intersected_drop) == 1: elif len(region_with_intersected_drop) == 1:
region_final = region_with_intersected_drop[0] - 1 region_final = region_with_intersected_drop[0] - 1
# areas_main=np.array([cv2.contourArea(all_found_texline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_texline_polygons[int(region_final)]))]) # areas_main=np.array([cv2.contourArea(all_found_textline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_textline_polygons[int(region_final)]))])
# cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)])
try: try:
cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)])
# print(all_box_coord[j_cont]) # print(all_box_coord[j_cont])
# print(cx_t) # print(cx_t)
# print(cy_t) # print(cy_t)
@ -157,9 +157,9 @@ def adhere_drop_capital_region_into_corresponding_textline(
arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop])) arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop]))
# print(arg_min) # print(arg_min)
cnt_nearest = np.copy(all_found_texline_polygons[int(region_final)][arg_min]) cnt_nearest = np.copy(all_found_textline_polygons[int(region_final)][arg_min])
cnt_nearest[:, 0, 0] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2] cnt_nearest[:, 0, 0] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2]
cnt_nearest[:, 0, 1] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0] cnt_nearest[:, 0, 1] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0]
img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3))
img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255)) img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255))
@ -184,15 +184,15 @@ def adhere_drop_capital_region_into_corresponding_textline(
# contours_biggest[:,0,0]=contours_biggest[:,0,0]#-all_box_coord[int(region_final)][2] # contours_biggest[:,0,0]=contours_biggest[:,0,0]#-all_box_coord[int(region_final)][2]
# contours_biggest[:,0,1]=contours_biggest[:,0,1]#-all_box_coord[int(region_final)][0] # contours_biggest[:,0,1]=contours_biggest[:,0,1]#-all_box_coord[int(region_final)][0]
# print(np.shape(contours_biggest),'contours_biggest') # print(np.shape(contours_biggest),'contours_biggest')
# print(np.shape(all_found_texline_polygons[int(region_final)][arg_min])) # print(np.shape(all_found_textline_polygons[int(region_final)][arg_min]))
##contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2]) ##contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2])
all_found_texline_polygons[int(region_final)][arg_min] = contours_biggest all_found_textline_polygons[int(region_final)][arg_min] = contours_biggest
except: except:
pass pass
try: try:
# print(all_found_texline_polygons[j_cont][0]) # print(all_found_textline_polygons[j_cont][0])
cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)])
# print(all_box_coord[j_cont]) # print(all_box_coord[j_cont])
# print(cx_t) # print(cx_t)
# print(cy_t) # print(cy_t)
@ -206,9 +206,9 @@ def adhere_drop_capital_region_into_corresponding_textline(
arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop])) arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop]))
# print(arg_min) # print(arg_min)
cnt_nearest = np.copy(all_found_texline_polygons[int(region_final)][arg_min]) cnt_nearest = np.copy(all_found_textline_polygons[int(region_final)][arg_min])
cnt_nearest[:, 0, 0] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2] cnt_nearest[:, 0, 0] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2]
cnt_nearest[:, 0, 1] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0] cnt_nearest[:, 0, 1] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0]
img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3))
img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255)) img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255))
@ -231,15 +231,15 @@ def adhere_drop_capital_region_into_corresponding_textline(
contours_biggest[:, 0, 1] = contours_biggest[:, 0, 1] # -all_box_coord[int(region_final)][0] contours_biggest[:, 0, 1] = contours_biggest[:, 0, 1] # -all_box_coord[int(region_final)][0]
##contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2]) ##contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2])
all_found_texline_polygons[int(region_final)][arg_min] = contours_biggest all_found_textline_polygons[int(region_final)][arg_min] = contours_biggest
# all_found_texline_polygons[int(region_final)][arg_min]=contours_biggest # all_found_textline_polygons[int(region_final)][arg_min]=contours_biggest
except: except:
pass pass
else: else:
pass pass
##cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) ##cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)])
###print(all_box_coord[j_cont]) ###print(all_box_coord[j_cont])
###print(cx_t) ###print(cx_t)
###print(cy_t) ###print(cy_t)
@ -253,9 +253,9 @@ def adhere_drop_capital_region_into_corresponding_textline(
##arg_min=np.argmin(np.abs(y_lines-y_min_d[i_drop]) ) ##arg_min=np.argmin(np.abs(y_lines-y_min_d[i_drop]) )
###print(arg_min) ###print(arg_min)
##cnt_nearest=np.copy(all_found_texline_polygons[int(region_final)][arg_min]) ##cnt_nearest=np.copy(all_found_textline_polygons[int(region_final)][arg_min])
##cnt_nearest[:,0,0]=all_found_texline_polygons[int(region_final)][arg_min][:,0,0]#+all_box_coord[int(region_final)][2] ##cnt_nearest[:,0,0]=all_found_textline_polygons[int(region_final)][arg_min][:,0,0]#+all_box_coord[int(region_final)][2]
##cnt_nearest[:,0,1]=all_found_texline_polygons[int(region_final)][arg_min][:,0,1]#+all_box_coord[int(region_final)][0] ##cnt_nearest[:,0,1]=all_found_textline_polygons[int(region_final)][arg_min][:,0,1]#+all_box_coord[int(region_final)][0]
##img_textlines=np.zeros((text_regions_p.shape[0],text_regions_p.shape[1],3)) ##img_textlines=np.zeros((text_regions_p.shape[0],text_regions_p.shape[1],3))
##img_textlines=cv2.fillPoly(img_textlines,pts=[cnt_nearest],color=(255,255,255)) ##img_textlines=cv2.fillPoly(img_textlines,pts=[cnt_nearest],color=(255,255,255))
@ -281,7 +281,7 @@ def adhere_drop_capital_region_into_corresponding_textline(
##contours_biggest[:,0,1]=contours_biggest[:,0,1]#-all_box_coord[int(region_final)][0] ##contours_biggest[:,0,1]=contours_biggest[:,0,1]#-all_box_coord[int(region_final)][0]
##contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2]) ##contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2])
##all_found_texline_polygons[int(region_final)][arg_min]=contours_biggest ##all_found_textline_polygons[int(region_final)][arg_min]=contours_biggest
else: else:
if len(region_with_intersected_drop) > 1: if len(region_with_intersected_drop) > 1:
@ -293,9 +293,9 @@ def adhere_drop_capital_region_into_corresponding_textline(
region_final = region_with_intersected_drop[np.argmax(sum_pixels_of_intersection)] - 1 region_final = region_with_intersected_drop[np.argmax(sum_pixels_of_intersection)] - 1
# print(region_final,'region_final') # print(region_final,'region_final')
# cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)])
try: try:
cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)])
# print(all_box_coord[j_cont]) # print(all_box_coord[j_cont])
# print(cx_t) # print(cx_t)
# print(cy_t) # print(cy_t)
@ -311,9 +311,9 @@ def adhere_drop_capital_region_into_corresponding_textline(
arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop])) arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop]))
# print(arg_min) # print(arg_min)
cnt_nearest = np.copy(all_found_texline_polygons[int(region_final)][arg_min]) cnt_nearest = np.copy(all_found_textline_polygons[int(region_final)][arg_min])
cnt_nearest[:, 0] = all_found_texline_polygons[int(region_final)][arg_min][:, 0] + all_box_coord[int(region_final)][2] cnt_nearest[:, 0] = all_found_textline_polygons[int(region_final)][arg_min][:, 0] + all_box_coord[int(region_final)][2]
cnt_nearest[:, 1] = all_found_texline_polygons[int(region_final)][arg_min][:, 1] + all_box_coord[int(region_final)][0] cnt_nearest[:, 1] = all_found_textline_polygons[int(region_final)][arg_min][:, 1] + all_box_coord[int(region_final)][0]
img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3))
img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255)) img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255))
@ -337,7 +337,7 @@ def adhere_drop_capital_region_into_corresponding_textline(
contours_biggest = contours_biggest.reshape(np.shape(contours_biggest)[0], np.shape(contours_biggest)[2]) contours_biggest = contours_biggest.reshape(np.shape(contours_biggest)[0], np.shape(contours_biggest)[2])
all_found_texline_polygons[int(region_final)][arg_min] = contours_biggest all_found_textline_polygons[int(region_final)][arg_min] = contours_biggest
except: except:
# print('gordun1') # print('gordun1')
@ -345,14 +345,14 @@ def adhere_drop_capital_region_into_corresponding_textline(
elif len(region_with_intersected_drop) == 1: elif len(region_with_intersected_drop) == 1:
region_final = region_with_intersected_drop[0] - 1 region_final = region_with_intersected_drop[0] - 1
# areas_main=np.array([cv2.contourArea(all_found_texline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_texline_polygons[int(region_final)]))]) # areas_main=np.array([cv2.contourArea(all_found_textline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_textline_polygons[int(region_final)]))])
# cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)])
# print(cx_t,'print') # print(cx_t,'print')
try: try:
# print(all_found_texline_polygons[j_cont][0]) # print(all_found_textline_polygons[j_cont][0])
cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)])
# print(all_box_coord[j_cont]) # print(all_box_coord[j_cont])
# print(cx_t) # print(cx_t)
# print(cy_t) # print(cy_t)
@ -366,9 +366,9 @@ def adhere_drop_capital_region_into_corresponding_textline(
arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop])) arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop]))
# print(arg_min) # print(arg_min)
cnt_nearest = np.copy(all_found_texline_polygons[int(region_final)][arg_min]) cnt_nearest = np.copy(all_found_textline_polygons[int(region_final)][arg_min])
cnt_nearest[:, 0] = all_found_texline_polygons[int(region_final)][arg_min][:, 0] + all_box_coord[int(region_final)][2] cnt_nearest[:, 0] = all_found_textline_polygons[int(region_final)][arg_min][:, 0] + all_box_coord[int(region_final)][2]
cnt_nearest[:, 1] = all_found_texline_polygons[int(region_final)][arg_min][:, 1] + all_box_coord[int(region_final)][0] cnt_nearest[:, 1] = all_found_textline_polygons[int(region_final)][arg_min][:, 1] + all_box_coord[int(region_final)][0]
img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3))
img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255)) img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255))
@ -391,8 +391,8 @@ def adhere_drop_capital_region_into_corresponding_textline(
contours_biggest[:, 0, 1] = contours_biggest[:, 0, 1] - all_box_coord[int(region_final)][0] contours_biggest[:, 0, 1] = contours_biggest[:, 0, 1] - all_box_coord[int(region_final)][0]
contours_biggest = contours_biggest.reshape(np.shape(contours_biggest)[0], np.shape(contours_biggest)[2]) contours_biggest = contours_biggest.reshape(np.shape(contours_biggest)[0], np.shape(contours_biggest)[2])
all_found_texline_polygons[int(region_final)][arg_min] = contours_biggest all_found_textline_polygons[int(region_final)][arg_min] = contours_biggest
# all_found_texline_polygons[int(region_final)][arg_min]=contours_biggest # all_found_textline_polygons[int(region_final)][arg_min]=contours_biggest
except: except:
pass pass
@ -417,8 +417,8 @@ def adhere_drop_capital_region_into_corresponding_textline(
######plt.show() ######plt.show()
#####try: #####try:
#####if len(contours_new_parent)==1: #####if len(contours_new_parent)==1:
######print(all_found_texline_polygons[j_cont][0]) ######print(all_found_textline_polygons[j_cont][0])
#####cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[j_cont]) #####cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[j_cont])
######print(all_box_coord[j_cont]) ######print(all_box_coord[j_cont])
######print(cx_t) ######print(cx_t)
######print(cy_t) ######print(cy_t)
@ -431,9 +431,9 @@ def adhere_drop_capital_region_into_corresponding_textline(
#####arg_min=np.argmin(np.abs(y_lines-y_min_d[i_drop]) ) #####arg_min=np.argmin(np.abs(y_lines-y_min_d[i_drop]) )
######print(arg_min) ######print(arg_min)
#####cnt_nearest=np.copy(all_found_texline_polygons[j_cont][arg_min]) #####cnt_nearest=np.copy(all_found_textline_polygons[j_cont][arg_min])
#####cnt_nearest[:,0]=all_found_texline_polygons[j_cont][arg_min][:,0]+all_box_coord[j_cont][2] #####cnt_nearest[:,0]=all_found_textline_polygons[j_cont][arg_min][:,0]+all_box_coord[j_cont][2]
#####cnt_nearest[:,1]=all_found_texline_polygons[j_cont][arg_min][:,1]+all_box_coord[j_cont][0] #####cnt_nearest[:,1]=all_found_textline_polygons[j_cont][arg_min][:,1]+all_box_coord[j_cont][0]
#####img_textlines=np.zeros((text_regions_p.shape[0],text_regions_p.shape[1],3)) #####img_textlines=np.zeros((text_regions_p.shape[0],text_regions_p.shape[1],3))
#####img_textlines=cv2.fillPoly(img_textlines,pts=[cnt_nearest],color=(255,255,255)) #####img_textlines=cv2.fillPoly(img_textlines,pts=[cnt_nearest],color=(255,255,255))
@ -454,7 +454,7 @@ def adhere_drop_capital_region_into_corresponding_textline(
#####contours_biggest[:,0,0]=contours_biggest[:,0,0]-all_box_coord[j_cont][2] #####contours_biggest[:,0,0]=contours_biggest[:,0,0]-all_box_coord[j_cont][2]
#####contours_biggest[:,0,1]=contours_biggest[:,0,1]-all_box_coord[j_cont][0] #####contours_biggest[:,0,1]=contours_biggest[:,0,1]-all_box_coord[j_cont][0]
#####all_found_texline_polygons[j_cont][arg_min]=contours_biggest #####all_found_textline_polygons[j_cont][arg_min]=contours_biggest
######print(contours_biggest) ######print(contours_biggest)
######plt.imshow(img_textlines[:,:,0]) ######plt.imshow(img_textlines[:,:,0])
######plt.show() ######plt.show()
@ -462,7 +462,7 @@ def adhere_drop_capital_region_into_corresponding_textline(
#####pass #####pass
#####except: #####except:
#####pass #####pass
return all_found_texline_polygons return all_found_textline_polygons
def filter_small_drop_capitals_from_no_patch_layout(layout_no_patch, layout1): def filter_small_drop_capitals_from_no_patch_layout(layout_no_patch, layout1):

@ -54,54 +54,54 @@ class EynollahXmlWriter():
points_page_print = points_page_print + ' ' points_page_print = points_page_print + ' '
return points_page_print[:-1] return points_page_print[:-1]
def serialize_lines_in_marginal(self, marginal_region, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter): def serialize_lines_in_marginal(self, marginal_region, all_found_textline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter):
for j in range(len(all_found_texline_polygons_marginals[marginal_idx])): for j in range(len(all_found_textline_polygons_marginals[marginal_idx])):
coords = CoordsType() coords = CoordsType()
textline = TextLineType(id=counter.next_line_id, Coords=coords) textline = TextLineType(id=counter.next_line_id, Coords=coords)
marginal_region.add_TextLine(textline) marginal_region.add_TextLine(textline)
points_co = '' points_co = ''
for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])): for l in range(len(all_found_textline_polygons_marginals[marginal_idx][j])):
if not (self.curved_line or self.textline_light): if not (self.curved_line or self.textline_light):
if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2:
textline_x_coord = max(0, int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) ) textline_x_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) )
textline_y_coord = max(0, int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) ) textline_y_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) )
else: else:
textline_x_coord = max(0, int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) ) textline_x_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) )
textline_y_coord = max(0, int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) ) textline_y_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) )
points_co += str(textline_x_coord) points_co += str(textline_x_coord)
points_co += ',' points_co += ','
points_co += str(textline_y_coord) points_co += str(textline_y_coord)
if (self.curved_line or self.textline_light) and np.abs(slopes_marginals[marginal_idx]) <= 45: if (self.curved_line or self.textline_light) and np.abs(slopes_marginals[marginal_idx]) <= 45:
if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x)) points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x))
points_co += ',' points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y)) points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y))
else: else:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x))
points_co += ',' points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y)) points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y))
elif (self.curved_line or self.textline_light) and np.abs(slopes_marginals[marginal_idx]) > 45: elif (self.curved_line or self.textline_light) and np.abs(slopes_marginals[marginal_idx]) > 45:
if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
points_co += ',' points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y))
else: else:
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x))
points_co += ',' points_co += ','
points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y))
points_co += ' ' points_co += ' '
coords.set_points(points_co[:-1]) coords.set_points(points_co[:-1])
def serialize_lines_in_region(self, text_region, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, counter): def serialize_lines_in_region(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter):
self.logger.debug('enter serialize_lines_in_region') self.logger.debug('enter serialize_lines_in_region')
for j in range(len(all_found_texline_polygons[region_idx])): for j in range(len(all_found_textline_polygons[region_idx])):
coords = CoordsType() coords = CoordsType()
textline = TextLineType(id=counter.next_line_id, Coords=coords) textline = TextLineType(id=counter.next_line_id, Coords=coords)
text_region.add_TextLine(textline) text_region.add_TextLine(textline)
region_bboxes = all_box_coord[region_idx] region_bboxes = all_box_coord[region_idx]
points_co = '' points_co = ''
for idx_contour_textline, contour_textline in enumerate(all_found_texline_polygons[region_idx][j]): for idx_contour_textline, contour_textline in enumerate(all_found_textline_polygons[region_idx][j]):
if not (self.curved_line or self.textline_light): if not (self.curved_line or self.textline_light):
if len(contour_textline) == 2: if len(contour_textline) == 2:
textline_x_coord = max(0, int((contour_textline[0] + region_bboxes[2] + page_coord[2]) / self.scale_x)) textline_x_coord = max(0, int((contour_textline[0] + region_bboxes[2] + page_coord[2]) / self.scale_x))
@ -140,7 +140,7 @@ class EynollahXmlWriter():
with open(out_fname, 'w') as f: with open(out_fname, 'w') as f:
f.write(to_xml(pcgts)) f.write(to_xml(pcgts))
def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables): def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables):
self.logger.debug('enter build_pagexml_no_full_layout') self.logger.debug('enter build_pagexml_no_full_layout')
# create the file structure # create the file structure
@ -159,13 +159,13 @@ class EynollahXmlWriter():
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)), Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)),
) )
page.add_TextRegion(textregion) page.add_TextRegion(textregion)
self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter)
for mm in range(len(found_polygons_marginals)): for mm in range(len(found_polygons_marginals)):
marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', marginal = TextRegionType(id=counter.next_region_id, type_='marginalia',
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)))
page.add_TextRegion(marginal) page.add_TextRegion(marginal)
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter)
for mm in range(len(found_polygons_text_region_img)): for mm in range(len(found_polygons_text_region_img)):
img_region = ImageRegionType(id=counter.next_region_id, Coords=CoordsType()) img_region = ImageRegionType(id=counter.next_region_id, Coords=CoordsType())
@ -201,7 +201,7 @@ class EynollahXmlWriter():
return pcgts return pcgts
def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml): def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml):
self.logger.debug('enter build_pagexml_full_layout') self.logger.debug('enter build_pagexml_full_layout')
# create the file structure # create the file structure
@ -218,20 +218,20 @@ class EynollahXmlWriter():
textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', textregion = TextRegionType(id=counter.next_region_id, type_='paragraph',
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord))) Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)))
page.add_TextRegion(textregion) page.add_TextRegion(textregion)
self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter)
self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h))
for mm in range(len(found_polygons_text_region_h)): for mm in range(len(found_polygons_text_region_h)):
textregion = TextRegionType(id=counter.next_region_id, type_='header', textregion = TextRegionType(id=counter.next_region_id, type_='header',
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))) Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord)))
page.add_TextRegion(textregion) page.add_TextRegion(textregion)
self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter) self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter)
for mm in range(len(found_polygons_marginals)): for mm in range(len(found_polygons_marginals)):
marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', marginal = TextRegionType(id=counter.next_region_id, type_='marginalia',
Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord)))
page.add_TextRegion(marginal) page.add_TextRegion(marginal)
self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter)
for mm in range(len(found_polygons_drop_capitals)): for mm in range(len(found_polygons_drop_capitals)):
page.add_TextRegion(TextRegionType(id=counter.next_region_id, type_='drop-capital', page.add_TextRegion(TextRegionType(id=counter.next_region_id, type_='drop-capital',

Loading…
Cancel
Save