diff --git a/qurator/eynollah/eynollah.py b/qurator/eynollah/eynollah.py index f183dee..1bb0eff 100644 --- a/qurator/eynollah/eynollah.py +++ b/qurator/eynollah/eynollah.py @@ -3735,9 +3735,9 @@ class Eynollah: contours_only_text_parent = [c for jz, c in enumerate(contours_only_text_parent) if areas_cnt_text[jz] > MIN_AREA_REGION] areas_cnt_text_parent = [area for area in areas_cnt_text if area > MIN_AREA_REGION] index_con_parents = np.argsort(areas_cnt_text_parent) - if len(contours_only_text_parent)>1: + try: contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=object)[index_con_parents]) - else: + except: contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=np.int32)[index_con_parents]) areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) @@ -3753,10 +3753,11 @@ class Eynollah: if len(areas_cnt_text_d)>0: contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)] index_con_parents_d = np.argsort(areas_cnt_text_d) - if len(contours_only_text_parent_d)>1: + try: contours_only_text_parent_d = list(np.array(contours_only_text_parent_d,dtype=object)[index_con_parents_d]) - else: + except: contours_only_text_parent_d = list(np.array(contours_only_text_parent_d,dtype=np.int32)[index_con_parents_d]) + areas_cnt_text_d = list(np.array(areas_cnt_text_d)[index_con_parents_d]) cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest_d]) @@ -3819,9 +3820,9 @@ class Eynollah: areas_cnt_text_parent = [area for area in areas_cnt_text if area > MIN_AREA_REGION] index_con_parents = np.argsort(areas_cnt_text_parent) - if len(contours_only_text_parent)>1: + try: contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=object)[index_con_parents]) - else: + except: contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=np.int32)[index_con_parents]) areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents]) @@ -3864,10 +3865,10 @@ class Eynollah: #print("text region early 6 in %.1fs", time.time() - t0) if self.full_layout: if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - if len(contours_only_text_parent_d_ordered)>1: - contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) - else: + try: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con]) + except: + contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) if self.light_version: text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered) else: @@ -3957,9 +3958,9 @@ class Eynollah: if np.abs(slope_deskew) < SLOPE_THRESHOLD: order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot) else: - if len(contours_only_text_parent_d_ordered)>1: + try: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con]) - else: + except: contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con]) order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d) diff --git a/qurator/eynollah/writer.py b/qurator/eynollah/writer.py index 29caddc..8eb1027 100644 --- a/qurator/eynollah/writer.py +++ b/qurator/eynollah/writer.py @@ -136,6 +136,29 @@ class EynollahXmlWriter(): points_co += str(int((contour_textline[0][1] + region_bboxes[0]+page_coord[0])/self.scale_y)) points_co += ' ' coords.set_points(points_co[:-1]) + + def serialize_lines_in_dropcapital(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter, ocr_all_textlines_textregion): + self.logger.debug('enter serialize_lines_in_region') + for j in range(1): + coords = CoordsType() + textline = TextLineType(id=counter.next_line_id, Coords=coords) + if ocr_all_textlines_textregion: + textline.set_TextEquiv( [ TextEquivType(Unicode=ocr_all_textlines_textregion[j]) ] ) + text_region.add_TextLine(textline) + #region_bboxes = all_box_coord[region_idx] + points_co = '' + for idx_contour_textline, contour_textline in enumerate(all_found_textline_polygons[j]): + if len(contour_textline) == 2: + points_co += str(int((contour_textline[0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((contour_textline[1] + page_coord[0]) / self.scale_y)) + else: + points_co += str(int((contour_textline[0][0] + page_coord[2]) / self.scale_x)) + points_co += ',' + points_co += str(int((contour_textline[0][1] + page_coord[0])/self.scale_y)) + + points_co += ' ' + coords.set_points(points_co[:-1]) def write_pagexml(self, pcgts): out_fname = os.path.join(self.dir_out, self.image_filename_stem) + ".xml" @@ -251,8 +274,12 @@ class EynollahXmlWriter(): self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter) for mm in range(len(found_polygons_drop_capitals)): - page.add_TextRegion(TextRegionType(id=counter.next_region_id, type_='drop-capital', - Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)))) + dropcapital = TextRegionType(id=counter.next_region_id, type_='drop-capital', + Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord))) + page.add_TextRegion(dropcapital) + all_box_coord_drop = None + slopes_drop = None + self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=None) for mm in range(len(found_polygons_text_region_img)): page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord))))