diff --git a/README.md b/README.md index d74c4a9..f6bc794 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ The following options can be used to further configure the processing: | `-tab` | apply table detection | | `-ae` | apply enhancement (the resulting image is saved to the output directory) | | `-as` | apply scaling | -| `-cl` | apply countour detection for curved text lines instead of bounding boxes | +| `-cl` | apply contour detection for curved text lines instead of bounding boxes | | `-ib` | apply binarization (the resulting image is saved to the output directory) | | `-ep` | enable plotting (MUST always be used with `-sl`, `-sd`, `-sa`, `-si` or `-ae`) | | `-ho` | ignore headers for reading order dectection | @@ -99,7 +99,7 @@ uses the original (RGB) image despite any binarization that may have occured in If you find this tool useful in your work, please consider citing our paper: ```bibtex -@inproceedings{rezanezhad2023documentlayoutanalysis, +@inproceedings{rezanezhad2023eynollah, title = {Document Layout Analysis with Deep Learning and Heuristics}, author = {Rezanezhad, Vahid and Baierer, Konstantin and Gerber, Mike and Labusch, Kai and Neudecker, Clemens}, booktitle = {Proceedings of the 7th International Workshop on Historical Document Imaging and Processing {HIP} 2023, diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index ad3f312..625f39a 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -1159,7 +1159,7 @@ class Eynollah: processes[i].start() slopes = [] - all_found_texline_polygons = [] + all_found_textline_polygons = [] all_found_text_regions = [] all_found_text_regions_par = [] boxes = [] @@ -1176,7 +1176,7 @@ class Eynollah: indexes_for_subprocess = list_all_par[6] for j in range(len(slopes_for_sub_process)): slopes.append(slopes_for_sub_process[j]) - all_found_texline_polygons.append(polys_for_sub_process[j]) + all_found_textline_polygons.append(polys_for_sub_process[j]) boxes.append(boxes_for_sub_process[j]) all_found_text_regions.append(contours_for_subprocess[j]) all_found_text_regions_par.append(contours_par_for_subprocess[j]) @@ -1186,7 +1186,7 @@ class Eynollah: processes[i].join() self.logger.debug('slopes %s', slopes) self.logger.debug("exit get_slopes_and_deskew_new") - return slopes, all_found_texline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con + return slopes, all_found_textline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con def get_slopes_and_deskew_new(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, slope_deskew): self.logger.debug("enter get_slopes_and_deskew_new") @@ -1207,7 +1207,7 @@ class Eynollah: processes[i].start() slopes = [] - all_found_texline_polygons = [] + all_found_textline_polygons = [] all_found_text_regions = [] all_found_text_regions_par = [] boxes = [] @@ -1224,7 +1224,7 @@ class Eynollah: indexes_for_subprocess = list_all_par[6] for j in range(len(slopes_for_sub_process)): slopes.append(slopes_for_sub_process[j]) - all_found_texline_polygons.append(polys_for_sub_process[j]) + all_found_textline_polygons.append(polys_for_sub_process[j]) boxes.append(boxes_for_sub_process[j]) all_found_text_regions.append(contours_for_subprocess[j]) all_found_text_regions_par.append(contours_par_for_subprocess[j]) @@ -1234,7 +1234,7 @@ class Eynollah: processes[i].join() self.logger.debug('slopes %s', slopes) self.logger.debug("exit get_slopes_and_deskew_new") - return slopes, all_found_texline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con + return slopes, all_found_textline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con def get_slopes_and_deskew_new_curved(self, contours, contours_par, textline_mask_tot, image_page_rotated, boxes, mask_texts_only, num_col, scale_par, slope_deskew): self.logger.debug("enter get_slopes_and_deskew_new_curved") @@ -1257,7 +1257,7 @@ class Eynollah: processes[i].start() slopes = [] - all_found_texline_polygons = [] + all_found_textline_polygons = [] all_found_text_regions = [] all_found_text_regions_par = [] boxes = [] @@ -1275,7 +1275,7 @@ class Eynollah: slopes_for_sub_process = list_all_par[6] for j in range(len(polys_for_sub_process)): slopes.append(slopes_for_sub_process[j]) - all_found_texline_polygons.append(polys_for_sub_process[j][::-1]) + all_found_textline_polygons.append(polys_for_sub_process[j][::-1]) boxes.append(boxes_for_sub_process[j]) all_found_text_regions.append(contours_for_subprocess[j]) all_found_text_regions_par.append(contours_par_for_subprocess[j]) @@ -1285,7 +1285,7 @@ class Eynollah: for i in range(num_cores): processes[i].join() # print(slopes,'slopes') - return all_found_texline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con, slopes + return all_found_textline_polygons, boxes, all_found_text_regions, all_found_text_regions_par, all_box_coord, all_index_text_con, slopes def do_work_of_slopes_new_curved(self, queue_of_all_params, boxes_text, textline_mask_tot_ea, contours_per_process, contours_par_per_process, image_page_rotated, mask_texts_only, num_col, scale_par, indexes_r_con_per_pro, slope_deskew): self.logger.debug("enter do_work_of_slopes_new_curved") @@ -3007,37 +3007,37 @@ class Eynollah: if not self.curved_line: if self.light_version: if self.textline_light: - slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, image_page_rotated, boxes_text, slope_deskew) - slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, image_page_rotated, boxes_marginals, slope_deskew) + slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea_org, image_page_rotated, boxes_text, slope_deskew) + slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea_org, image_page_rotated, boxes_marginals, slope_deskew) else: - slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) - slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) + slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new_light(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) + slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new_light(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) else: - slopes, all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) - slopes_marginals, all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) + slopes, all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con = self.get_slopes_and_deskew_new(txt_con_org, contours_only_text_parent, textline_mask_tot_ea, image_page_rotated, boxes_text, slope_deskew) + slopes_marginals, all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _ = self.get_slopes_and_deskew_new(polygons_of_marginals, polygons_of_marginals, textline_mask_tot_ea, image_page_rotated, boxes_marginals, slope_deskew) else: scale_param = 1 - all_found_texline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) - all_found_texline_polygons = small_textlines_to_parent_adherence2(all_found_texline_polygons, textline_mask_tot_ea, num_col_classifier) - all_found_texline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) - all_found_texline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_texline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) + all_found_textline_polygons, boxes_text, txt_con_org, contours_only_text_parent, all_box_coord, index_by_text_par_con, slopes = self.get_slopes_and_deskew_new_curved(txt_con_org, contours_only_text_parent, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_text, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_textline_polygons = small_textlines_to_parent_adherence2(all_found_textline_polygons, textline_mask_tot_ea, num_col_classifier) + all_found_textline_polygons_marginals, boxes_marginals, _, polygons_of_marginals, all_box_coord_marginals, _, slopes_marginals = self.get_slopes_and_deskew_new_curved(polygons_of_marginals, polygons_of_marginals, cv2.erode(textline_mask_tot_ea, kernel=KERNEL, iterations=1), image_page_rotated, boxes_marginals, text_only, num_col_classifier, scale_param, slope_deskew) + all_found_textline_polygons_marginals = small_textlines_to_parent_adherence2(all_found_textline_polygons_marginals, textline_mask_tot_ea, num_col_classifier) if self.full_layout: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) if self.light_version: - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) else: - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) else: #takes long timee contours_only_text_parent_d_ordered = None if self.light_version: - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) else: - text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_texline_polygons, slopes, contours_only_text_parent_d_ordered) + text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) if self.plotter: self.plotter.save_plot_of_layout(text_regions_p, image_page) @@ -3045,7 +3045,7 @@ class Eynollah: pixel_img = 4 polygons_of_drop_capitals = return_contours_of_interested_region_by_min_size(text_regions_p, pixel_img) - all_found_texline_polygons = adhere_drop_capital_region_into_corresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_texline_polygons, all_found_texline_polygons_h, kernel=KERNEL, curved_line=self.curved_line) + all_found_textline_polygons = adhere_drop_capital_region_into_corresponding_textline(text_regions_p, polygons_of_drop_capitals, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, kernel=KERNEL, curved_line=self.curved_line) pixel_lines = 6 @@ -3091,7 +3091,7 @@ class Eynollah: else: order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered, boxes_d, textline_mask_tot_d) - pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml) + pcgts = self.writer.build_pagexml_full_layout(contours_only_text_parent, contours_only_text_parent_h, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, polygons_of_images, contours_tables, polygons_of_drop_capitals, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_xml) self.logger.info("Job done in %.1fs", time.time() - t0) ##return pcgts else: @@ -3101,7 +3101,7 @@ class Eynollah: else: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) - pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_texline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables) + pcgts = self.writer.build_pagexml_no_full_layout(txt_con_org, page_coord, order_text_new, id_of_texts_tot, all_found_textline_polygons, all_box_coord, polygons_of_images, polygons_of_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_xml, contours_tables) self.logger.info("Job done in %.1fs", time.time() - t0) ##return pcgts self.writer.write_pagexml(pcgts) diff --git a/qurator/eynollah/utils/__init__.py b/qurator/eynollah/utils/__init__.py index b85abdf..d2b2488 100644 --- a/qurator/eynollah/utils/__init__.py +++ b/qurator/eynollah/utils/__init__.py @@ -796,7 +796,7 @@ def putt_bb_of_drop_capitals_of_model_in_patches_in_layout(layout_in_patch): return layout_in_patch -def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_texline_polygons,slopes,contours_only_text_parent_d_ordered): +def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_textline_polygons,slopes,contours_only_text_parent_d_ordered): cx_main,cy_main ,x_min_main , x_max_main, y_min_main ,y_max_main,y_corr_x_min_from_argmin=find_new_features_of_contours(contours_only_text_parent) @@ -805,8 +805,8 @@ def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions - all_found_texline_polygons_main=[] - all_found_texline_polygons_head=[] + all_found_textline_polygons_main=[] + all_found_textline_polygons_head=[] all_box_coord_main=[] all_box_coord_head=[] @@ -840,7 +840,7 @@ def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_head.append(all_box_coord[ii]) slopes_head.append(slopes[ii]) - all_found_texline_polygons_head.append(all_found_texline_polygons[ii]) + all_found_textline_polygons_head.append(all_found_textline_polygons[ii]) else: regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=1 contours_only_text_parent_main.append(con) @@ -848,14 +848,14 @@ def check_any_text_region_in_model_one_is_main_or_header(regions_model_1,regions contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_main.append(all_box_coord[ii]) slopes_main.append(slopes[ii]) - all_found_texline_polygons_main.append(all_found_texline_polygons[ii]) + all_found_textline_polygons_main.append(all_found_textline_polygons[ii]) #print(all_pixels,pixels_main,pixels_header) - return regions_model_1,contours_only_text_parent_main,contours_only_text_parent_head,all_box_coord_main,all_box_coord_head,all_found_texline_polygons_main,all_found_texline_polygons_head,slopes_main,slopes_head,contours_only_text_parent_main_d,contours_only_text_parent_head_d + return regions_model_1,contours_only_text_parent_main,contours_only_text_parent_head,all_box_coord_main,all_box_coord_head,all_found_textline_polygons_main,all_found_textline_polygons_head,slopes_main,slopes_head,contours_only_text_parent_main_d,contours_only_text_parent_head_d -def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_texline_polygons,slopes,contours_only_text_parent_d_ordered): +def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,regions_model_full,contours_only_text_parent,all_box_coord,all_found_textline_polygons,slopes,contours_only_text_parent_d_ordered): ### to make it faster h_o = regions_model_1.shape[0] @@ -874,8 +874,8 @@ def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,r - all_found_texline_polygons_main=[] - all_found_texline_polygons_head=[] + all_found_textline_polygons_main=[] + all_found_textline_polygons_head=[] all_box_coord_main=[] all_box_coord_head=[] @@ -909,7 +909,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,r contours_only_text_parent_head_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_head.append(all_box_coord[ii]) slopes_head.append(slopes[ii]) - all_found_texline_polygons_head.append(all_found_texline_polygons[ii]) + all_found_textline_polygons_head.append(all_found_textline_polygons[ii]) else: regions_model_1[:,:][(regions_model_1[:,:]==1) & (img[:,:,0]==255) ]=1 contours_only_text_parent_main.append(con) @@ -917,7 +917,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,r contours_only_text_parent_main_d.append(contours_only_text_parent_d_ordered[ii]) all_box_coord_main.append(all_box_coord[ii]) slopes_main.append(slopes[ii]) - all_found_texline_polygons_main.append(all_found_texline_polygons[ii]) + all_found_textline_polygons_main.append(all_found_textline_polygons[ii]) #print(all_pixels,pixels_main,pixels_header) @@ -931,7 +931,7 @@ def check_any_text_region_in_model_one_is_main_or_header_light(regions_model_1,r contours_only_text_parent_main = [ (i*3.).astype(np.int32) for i in contours_only_text_parent_main] ### - return regions_model_1,contours_only_text_parent_main,contours_only_text_parent_head,all_box_coord_main,all_box_coord_head,all_found_texline_polygons_main,all_found_texline_polygons_head,slopes_main,slopes_head,contours_only_text_parent_main_d,contours_only_text_parent_head_d + return regions_model_1,contours_only_text_parent_main,contours_only_text_parent_head,all_box_coord_main,all_box_coord_head,all_found_textline_polygons_main,all_found_textline_polygons_head,slopes_main,slopes_head,contours_only_text_parent_main_d,contours_only_text_parent_head_d def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col): # print(textlines_con) diff --git a/qurator/eynollah/utils/counter.py b/qurator/eynollah/utils/counter.py index bc1d765..9a3ed70 100644 --- a/qurator/eynollah/utils/counter.py +++ b/qurator/eynollah/utils/counter.py @@ -7,13 +7,13 @@ class EynollahIdCounter(): def __init__(self, region_idx=0, line_idx=0): self._counter = Counter() - self._inital_region_idx = region_idx - self._inital_line_idx = line_idx + self._initial_region_idx = region_idx + self._initial_line_idx = line_idx self.reset() def reset(self): - self.set('region', self._inital_region_idx) - self.set('line', self._inital_line_idx) + self.set('region', self._initial_region_idx) + self.set('line', self._initial_line_idx) def inc(self, name, val=1): self._counter.update({name: val}) diff --git a/qurator/eynollah/utils/drop_capitals.py b/qurator/eynollah/utils/drop_capitals.py index 6d1edfa..e12028f 100644 --- a/qurator/eynollah/utils/drop_capitals.py +++ b/qurator/eynollah/utils/drop_capitals.py @@ -13,13 +13,13 @@ def adhere_drop_capital_region_into_corresponding_textline( contours_only_text_parent_h, all_box_coord, all_box_coord_h, - all_found_texline_polygons, - all_found_texline_polygons_h, + all_found_textline_polygons, + all_found_textline_polygons_h, kernel=None, curved_line=False, ): - # print(np.shape(all_found_texline_polygons),np.shape(all_found_texline_polygons[3]),'all_found_texline_polygonsshape') - # print(all_found_texline_polygons[3]) + # print(np.shape(all_found_textline_polygons),np.shape(all_found_textline_polygons[3]),'all_found_textline_polygonsshape') + # print(all_found_textline_polygons[3]) cx_m, cy_m, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent) cx_h, cy_h, _, _, _, _, _ = find_new_features_of_contours(contours_only_text_parent_h) cx_d, cy_d, _, _, y_min_d, y_max_d, _ = find_new_features_of_contours(polygons_of_drop_capitals) @@ -87,9 +87,9 @@ def adhere_drop_capital_region_into_corresponding_textline( region_final = region_with_intersected_drop[np.argmax(sum_pixels_of_intersection)] - 1 # print(region_final,'region_final') - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) + # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) try: - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) + cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -105,9 +105,9 @@ def adhere_drop_capital_region_into_corresponding_textline( arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop])) # print(arg_min) - cnt_nearest = np.copy(all_found_texline_polygons[int(region_final)][arg_min]) - cnt_nearest[:, 0, 0] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2] - cnt_nearest[:, 0, 1] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0] + cnt_nearest = np.copy(all_found_textline_polygons[int(region_final)][arg_min]) + cnt_nearest[:, 0, 0] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2] + cnt_nearest[:, 0, 1] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0] img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255)) @@ -131,7 +131,7 @@ def adhere_drop_capital_region_into_corresponding_textline( # contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2]) - all_found_texline_polygons[int(region_final)][arg_min] = contours_biggest + all_found_textline_polygons[int(region_final)][arg_min] = contours_biggest except: # print('gordun1') @@ -139,11 +139,11 @@ def adhere_drop_capital_region_into_corresponding_textline( elif len(region_with_intersected_drop) == 1: region_final = region_with_intersected_drop[0] - 1 - # areas_main=np.array([cv2.contourArea(all_found_texline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_texline_polygons[int(region_final)]))]) + # areas_main=np.array([cv2.contourArea(all_found_textline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_textline_polygons[int(region_final)]))]) - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) + # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) try: - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) + cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -157,9 +157,9 @@ def adhere_drop_capital_region_into_corresponding_textline( arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop])) # print(arg_min) - cnt_nearest = np.copy(all_found_texline_polygons[int(region_final)][arg_min]) - cnt_nearest[:, 0, 0] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2] - cnt_nearest[:, 0, 1] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0] + cnt_nearest = np.copy(all_found_textline_polygons[int(region_final)][arg_min]) + cnt_nearest[:, 0, 0] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2] + cnt_nearest[:, 0, 1] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0] img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255)) @@ -184,15 +184,15 @@ def adhere_drop_capital_region_into_corresponding_textline( # contours_biggest[:,0,0]=contours_biggest[:,0,0]#-all_box_coord[int(region_final)][2] # contours_biggest[:,0,1]=contours_biggest[:,0,1]#-all_box_coord[int(region_final)][0] # print(np.shape(contours_biggest),'contours_biggest') - # print(np.shape(all_found_texline_polygons[int(region_final)][arg_min])) + # print(np.shape(all_found_textline_polygons[int(region_final)][arg_min])) ##contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2]) - all_found_texline_polygons[int(region_final)][arg_min] = contours_biggest + all_found_textline_polygons[int(region_final)][arg_min] = contours_biggest except: pass try: - # print(all_found_texline_polygons[j_cont][0]) - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) + # print(all_found_textline_polygons[j_cont][0]) + cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -206,9 +206,9 @@ def adhere_drop_capital_region_into_corresponding_textline( arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop])) # print(arg_min) - cnt_nearest = np.copy(all_found_texline_polygons[int(region_final)][arg_min]) - cnt_nearest[:, 0, 0] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2] - cnt_nearest[:, 0, 1] = all_found_texline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0] + cnt_nearest = np.copy(all_found_textline_polygons[int(region_final)][arg_min]) + cnt_nearest[:, 0, 0] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 0] # +all_box_coord[int(region_final)][2] + cnt_nearest[:, 0, 1] = all_found_textline_polygons[int(region_final)][arg_min][:, 0, 1] # +all_box_coord[int(region_final)][0] img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255)) @@ -231,15 +231,15 @@ def adhere_drop_capital_region_into_corresponding_textline( contours_biggest[:, 0, 1] = contours_biggest[:, 0, 1] # -all_box_coord[int(region_final)][0] ##contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2]) - all_found_texline_polygons[int(region_final)][arg_min] = contours_biggest - # all_found_texline_polygons[int(region_final)][arg_min]=contours_biggest + all_found_textline_polygons[int(region_final)][arg_min] = contours_biggest + # all_found_textline_polygons[int(region_final)][arg_min]=contours_biggest except: pass else: pass - ##cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) + ##cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) ###print(all_box_coord[j_cont]) ###print(cx_t) ###print(cy_t) @@ -253,9 +253,9 @@ def adhere_drop_capital_region_into_corresponding_textline( ##arg_min=np.argmin(np.abs(y_lines-y_min_d[i_drop]) ) ###print(arg_min) - ##cnt_nearest=np.copy(all_found_texline_polygons[int(region_final)][arg_min]) - ##cnt_nearest[:,0,0]=all_found_texline_polygons[int(region_final)][arg_min][:,0,0]#+all_box_coord[int(region_final)][2] - ##cnt_nearest[:,0,1]=all_found_texline_polygons[int(region_final)][arg_min][:,0,1]#+all_box_coord[int(region_final)][0] + ##cnt_nearest=np.copy(all_found_textline_polygons[int(region_final)][arg_min]) + ##cnt_nearest[:,0,0]=all_found_textline_polygons[int(region_final)][arg_min][:,0,0]#+all_box_coord[int(region_final)][2] + ##cnt_nearest[:,0,1]=all_found_textline_polygons[int(region_final)][arg_min][:,0,1]#+all_box_coord[int(region_final)][0] ##img_textlines=np.zeros((text_regions_p.shape[0],text_regions_p.shape[1],3)) ##img_textlines=cv2.fillPoly(img_textlines,pts=[cnt_nearest],color=(255,255,255)) @@ -281,7 +281,7 @@ def adhere_drop_capital_region_into_corresponding_textline( ##contours_biggest[:,0,1]=contours_biggest[:,0,1]#-all_box_coord[int(region_final)][0] ##contours_biggest=contours_biggest.reshape(np.shape(contours_biggest)[0],np.shape(contours_biggest)[2]) - ##all_found_texline_polygons[int(region_final)][arg_min]=contours_biggest + ##all_found_textline_polygons[int(region_final)][arg_min]=contours_biggest else: if len(region_with_intersected_drop) > 1: @@ -293,9 +293,9 @@ def adhere_drop_capital_region_into_corresponding_textline( region_final = region_with_intersected_drop[np.argmax(sum_pixels_of_intersection)] - 1 # print(region_final,'region_final') - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) + # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) try: - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) + cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -311,9 +311,9 @@ def adhere_drop_capital_region_into_corresponding_textline( arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop])) # print(arg_min) - cnt_nearest = np.copy(all_found_texline_polygons[int(region_final)][arg_min]) - cnt_nearest[:, 0] = all_found_texline_polygons[int(region_final)][arg_min][:, 0] + all_box_coord[int(region_final)][2] - cnt_nearest[:, 1] = all_found_texline_polygons[int(region_final)][arg_min][:, 1] + all_box_coord[int(region_final)][0] + cnt_nearest = np.copy(all_found_textline_polygons[int(region_final)][arg_min]) + cnt_nearest[:, 0] = all_found_textline_polygons[int(region_final)][arg_min][:, 0] + all_box_coord[int(region_final)][2] + cnt_nearest[:, 1] = all_found_textline_polygons[int(region_final)][arg_min][:, 1] + all_box_coord[int(region_final)][0] img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255)) @@ -337,7 +337,7 @@ def adhere_drop_capital_region_into_corresponding_textline( contours_biggest = contours_biggest.reshape(np.shape(contours_biggest)[0], np.shape(contours_biggest)[2]) - all_found_texline_polygons[int(region_final)][arg_min] = contours_biggest + all_found_textline_polygons[int(region_final)][arg_min] = contours_biggest except: # print('gordun1') @@ -345,14 +345,14 @@ def adhere_drop_capital_region_into_corresponding_textline( elif len(region_with_intersected_drop) == 1: region_final = region_with_intersected_drop[0] - 1 - # areas_main=np.array([cv2.contourArea(all_found_texline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_texline_polygons[int(region_final)]))]) + # areas_main=np.array([cv2.contourArea(all_found_textline_polygons[int(region_final)][0][j] ) for j in range(len(all_found_textline_polygons[int(region_final)]))]) - # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) + # cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) # print(cx_t,'print') try: - # print(all_found_texline_polygons[j_cont][0]) - cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_texline_polygons[int(region_final)]) + # print(all_found_textline_polygons[j_cont][0]) + cx_t, cy_t, _, _, _, _, _ = find_new_features_of_contours(all_found_textline_polygons[int(region_final)]) # print(all_box_coord[j_cont]) # print(cx_t) # print(cy_t) @@ -366,9 +366,9 @@ def adhere_drop_capital_region_into_corresponding_textline( arg_min = np.argmin(np.abs(y_lines - y_min_d[i_drop])) # print(arg_min) - cnt_nearest = np.copy(all_found_texline_polygons[int(region_final)][arg_min]) - cnt_nearest[:, 0] = all_found_texline_polygons[int(region_final)][arg_min][:, 0] + all_box_coord[int(region_final)][2] - cnt_nearest[:, 1] = all_found_texline_polygons[int(region_final)][arg_min][:, 1] + all_box_coord[int(region_final)][0] + cnt_nearest = np.copy(all_found_textline_polygons[int(region_final)][arg_min]) + cnt_nearest[:, 0] = all_found_textline_polygons[int(region_final)][arg_min][:, 0] + all_box_coord[int(region_final)][2] + cnt_nearest[:, 1] = all_found_textline_polygons[int(region_final)][arg_min][:, 1] + all_box_coord[int(region_final)][0] img_textlines = np.zeros((text_regions_p.shape[0], text_regions_p.shape[1], 3)) img_textlines = cv2.fillPoly(img_textlines, pts=[cnt_nearest], color=(255, 255, 255)) @@ -391,8 +391,8 @@ def adhere_drop_capital_region_into_corresponding_textline( contours_biggest[:, 0, 1] = contours_biggest[:, 0, 1] - all_box_coord[int(region_final)][0] contours_biggest = contours_biggest.reshape(np.shape(contours_biggest)[0], np.shape(contours_biggest)[2]) - all_found_texline_polygons[int(region_final)][arg_min] = contours_biggest - # all_found_texline_polygons[int(region_final)][arg_min]=contours_biggest + all_found_textline_polygons[int(region_final)][arg_min] = contours_biggest + # all_found_textline_polygons[int(region_final)][arg_min]=contours_biggest except: pass @@ -417,8 +417,8 @@ def adhere_drop_capital_region_into_corresponding_textline( ######plt.show() #####try: #####if len(contours_new_parent)==1: - ######print(all_found_texline_polygons[j_cont][0]) - #####cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_texline_polygons[j_cont]) + ######print(all_found_textline_polygons[j_cont][0]) + #####cx_t,cy_t ,_, _, _ ,_,_= find_new_features_of_contours(all_found_textline_polygons[j_cont]) ######print(all_box_coord[j_cont]) ######print(cx_t) ######print(cy_t) @@ -431,9 +431,9 @@ def adhere_drop_capital_region_into_corresponding_textline( #####arg_min=np.argmin(np.abs(y_lines-y_min_d[i_drop]) ) ######print(arg_min) - #####cnt_nearest=np.copy(all_found_texline_polygons[j_cont][arg_min]) - #####cnt_nearest[:,0]=all_found_texline_polygons[j_cont][arg_min][:,0]+all_box_coord[j_cont][2] - #####cnt_nearest[:,1]=all_found_texline_polygons[j_cont][arg_min][:,1]+all_box_coord[j_cont][0] + #####cnt_nearest=np.copy(all_found_textline_polygons[j_cont][arg_min]) + #####cnt_nearest[:,0]=all_found_textline_polygons[j_cont][arg_min][:,0]+all_box_coord[j_cont][2] + #####cnt_nearest[:,1]=all_found_textline_polygons[j_cont][arg_min][:,1]+all_box_coord[j_cont][0] #####img_textlines=np.zeros((text_regions_p.shape[0],text_regions_p.shape[1],3)) #####img_textlines=cv2.fillPoly(img_textlines,pts=[cnt_nearest],color=(255,255,255)) @@ -454,7 +454,7 @@ def adhere_drop_capital_region_into_corresponding_textline( #####contours_biggest[:,0,0]=contours_biggest[:,0,0]-all_box_coord[j_cont][2] #####contours_biggest[:,0,1]=contours_biggest[:,0,1]-all_box_coord[j_cont][0] - #####all_found_texline_polygons[j_cont][arg_min]=contours_biggest + #####all_found_textline_polygons[j_cont][arg_min]=contours_biggest ######print(contours_biggest) ######plt.imshow(img_textlines[:,:,0]) ######plt.show() @@ -462,7 +462,7 @@ def adhere_drop_capital_region_into_corresponding_textline( #####pass #####except: #####pass - return all_found_texline_polygons + return all_found_textline_polygons def filter_small_drop_capitals_from_no_patch_layout(layout_no_patch, layout1): diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index d5704f6..f537f65 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -54,54 +54,54 @@ class EynollahXmlWriter(): points_page_print = points_page_print + ' ' return points_page_print[:-1] - def serialize_lines_in_marginal(self, marginal_region, all_found_texline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter): - for j in range(len(all_found_texline_polygons_marginals[marginal_idx])): + def serialize_lines_in_marginal(self, marginal_region, all_found_textline_polygons_marginals, marginal_idx, page_coord, all_box_coord_marginals, slopes_marginals, counter): + for j in range(len(all_found_textline_polygons_marginals[marginal_idx])): coords = CoordsType() textline = TextLineType(id=counter.next_line_id, Coords=coords) marginal_region.add_TextLine(textline) points_co = '' - for l in range(len(all_found_texline_polygons_marginals[marginal_idx][j])): + for l in range(len(all_found_textline_polygons_marginals[marginal_idx][j])): if not (self.curved_line or self.textline_light): - if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: - textline_x_coord = max(0, int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) ) - textline_y_coord = max(0, int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) ) + if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2: + textline_x_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) ) + textline_y_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) ) else: - textline_x_coord = max(0, int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) ) - textline_y_coord = max(0, int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) ) + textline_x_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x) ) + textline_y_coord = max(0, int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y) ) points_co += str(textline_x_coord) points_co += ',' points_co += str(textline_y_coord) if (self.curved_line or self.textline_light) and np.abs(slopes_marginals[marginal_idx]) <= 45: - if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x)) + if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2: + points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + page_coord[2]) / self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y)) + points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + page_coord[0]) / self.scale_y)) else: - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) + points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + page_coord[2]) / self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y)) + points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + page_coord[0]) / self.scale_y)) elif (self.curved_line or self.textline_light) and np.abs(slopes_marginals[marginal_idx]) > 45: - if len(all_found_texline_polygons_marginals[marginal_idx][j][l]) == 2: - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) + if len(all_found_textline_polygons_marginals[marginal_idx][j][l]) == 2: + points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) + points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) else: - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) + points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][0] + all_box_coord_marginals[marginal_idx][2] + page_coord[2]) / self.scale_x)) points_co += ',' - points_co += str(int((all_found_texline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) + points_co += str(int((all_found_textline_polygons_marginals[marginal_idx][j][l][0][1] + all_box_coord_marginals[marginal_idx][0] + page_coord[0]) / self.scale_y)) points_co += ' ' coords.set_points(points_co[:-1]) - def serialize_lines_in_region(self, text_region, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, counter): + def serialize_lines_in_region(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter): self.logger.debug('enter serialize_lines_in_region') - for j in range(len(all_found_texline_polygons[region_idx])): + for j in range(len(all_found_textline_polygons[region_idx])): coords = CoordsType() textline = TextLineType(id=counter.next_line_id, Coords=coords) text_region.add_TextLine(textline) region_bboxes = all_box_coord[region_idx] points_co = '' - for idx_contour_textline, contour_textline in enumerate(all_found_texline_polygons[region_idx][j]): + for idx_contour_textline, contour_textline in enumerate(all_found_textline_polygons[region_idx][j]): if not (self.curved_line or self.textline_light): if len(contour_textline) == 2: textline_x_coord = max(0, int((contour_textline[0] + region_bboxes[2] + page_coord[2]) / self.scale_x)) @@ -140,7 +140,7 @@ class EynollahXmlWriter(): with open(out_fname, 'w') as f: f.write(to_xml(pcgts)) - def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables): + def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables): self.logger.debug('enter build_pagexml_no_full_layout') # create the file structure @@ -159,13 +159,13 @@ class EynollahXmlWriter(): Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord)), ) page.add_TextRegion(textregion) - self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) + self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter) for mm in range(len(found_polygons_marginals)): marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) page.add_TextRegion(marginal) - self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) for mm in range(len(found_polygons_text_region_img)): img_region = ImageRegionType(id=counter.next_region_id, Coords=CoordsType()) @@ -201,7 +201,7 @@ class EynollahXmlWriter(): return pcgts - def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_texline_polygons, all_found_texline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_texline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml): + def build_pagexml_full_layout(self, found_polygons_text_region, found_polygons_text_region_h, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_found_textline_polygons_h, all_box_coord, all_box_coord_h, found_polygons_text_region_img, found_polygons_tables, found_polygons_drop_capitals, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_h, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml): self.logger.debug('enter build_pagexml_full_layout') # create the file structure @@ -218,20 +218,20 @@ class EynollahXmlWriter(): textregion = TextRegionType(id=counter.next_region_id, type_='paragraph', Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region[mm], page_coord))) page.add_TextRegion(textregion) - self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord, slopes, counter) + self.serialize_lines_in_region(textregion, all_found_textline_polygons, mm, page_coord, all_box_coord, slopes, counter) self.logger.debug('len(found_polygons_text_region_h) %s', len(found_polygons_text_region_h)) for mm in range(len(found_polygons_text_region_h)): textregion = TextRegionType(id=counter.next_region_id, type_='header', Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_h[mm], page_coord))) page.add_TextRegion(textregion) - self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter) + self.serialize_lines_in_region(textregion, all_found_textline_polygons_h, mm, page_coord, all_box_coord_h, slopes_h, counter) for mm in range(len(found_polygons_marginals)): marginal = TextRegionType(id=counter.next_region_id, type_='marginalia', Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_marginals[mm], page_coord))) page.add_TextRegion(marginal) - self.serialize_lines_in_marginal(marginal, all_found_texline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) + self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) for mm in range(len(found_polygons_drop_capitals)): page.add_TextRegion(TextRegionType(id=counter.next_region_id, type_='drop-capital',