mirror of
				https://github.com/qurator-spk/eynollah.git
				synced 2025-11-03 19:24:13 +01:00 
			
		
		
		
	Write drop capitals in XML output; this may resolve issue #110
This commit is contained in:
		
							parent
							
								
									93005959e5
								
							
						
					
					
						commit
						0f87974b0c
					
				
					 2 changed files with 41 additions and 13 deletions
				
			
		| 
						 | 
				
			
			@ -3735,9 +3735,9 @@ class Eynollah:
 | 
			
		|||
                    contours_only_text_parent = [c for jz, c in enumerate(contours_only_text_parent) if areas_cnt_text[jz] > MIN_AREA_REGION]
 | 
			
		||||
                    areas_cnt_text_parent = [area for area in areas_cnt_text if area > MIN_AREA_REGION]
 | 
			
		||||
                    index_con_parents = np.argsort(areas_cnt_text_parent)
 | 
			
		||||
                    if len(contours_only_text_parent)>1:
 | 
			
		||||
                    try:
 | 
			
		||||
                        contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=object)[index_con_parents])
 | 
			
		||||
                    else:
 | 
			
		||||
                    except:
 | 
			
		||||
                        contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=np.int32)[index_con_parents])
 | 
			
		||||
                    areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents])
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -3753,10 +3753,11 @@ class Eynollah:
 | 
			
		|||
                    if len(areas_cnt_text_d)>0:
 | 
			
		||||
                        contours_biggest_d = contours_only_text_parent_d[np.argmax(areas_cnt_text_d)]
 | 
			
		||||
                        index_con_parents_d = np.argsort(areas_cnt_text_d)
 | 
			
		||||
                        if len(contours_only_text_parent_d)>1:
 | 
			
		||||
                        try:
 | 
			
		||||
                            contours_only_text_parent_d = list(np.array(contours_only_text_parent_d,dtype=object)[index_con_parents_d])
 | 
			
		||||
                        else:
 | 
			
		||||
                        except:
 | 
			
		||||
                            contours_only_text_parent_d = list(np.array(contours_only_text_parent_d,dtype=np.int32)[index_con_parents_d])
 | 
			
		||||
                            
 | 
			
		||||
                        areas_cnt_text_d = list(np.array(areas_cnt_text_d)[index_con_parents_d])
 | 
			
		||||
 | 
			
		||||
                        cx_bigest_d_big, cy_biggest_d_big, _, _, _, _, _ = find_new_features_of_contours([contours_biggest_d])
 | 
			
		||||
| 
						 | 
				
			
			@ -3819,9 +3820,9 @@ class Eynollah:
 | 
			
		|||
                    areas_cnt_text_parent = [area for area in areas_cnt_text if area > MIN_AREA_REGION]
 | 
			
		||||
 | 
			
		||||
                    index_con_parents = np.argsort(areas_cnt_text_parent)
 | 
			
		||||
                    if len(contours_only_text_parent)>1:
 | 
			
		||||
                    try:
 | 
			
		||||
                        contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=object)[index_con_parents])
 | 
			
		||||
                    else:
 | 
			
		||||
                    except:
 | 
			
		||||
                        contours_only_text_parent = list(np.array(contours_only_text_parent,dtype=np.int32)[index_con_parents])
 | 
			
		||||
                    areas_cnt_text_parent = list(np.array(areas_cnt_text_parent)[index_con_parents])
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -3864,10 +3865,10 @@ class Eynollah:
 | 
			
		|||
            #print("text region early 6 in %.1fs", time.time() - t0)
 | 
			
		||||
            if self.full_layout:
 | 
			
		||||
                if np.abs(slope_deskew) >= SLOPE_THRESHOLD:
 | 
			
		||||
                    if len(contours_only_text_parent_d_ordered)>1:
 | 
			
		||||
                        contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con])
 | 
			
		||||
                    else:
 | 
			
		||||
                    try:
 | 
			
		||||
                        contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con])
 | 
			
		||||
                    except:
 | 
			
		||||
                        contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con])
 | 
			
		||||
                    if self.light_version:
 | 
			
		||||
                        text_regions_p, contours_only_text_parent, contours_only_text_parent_h, all_box_coord, all_box_coord_h, all_found_textline_polygons, all_found_textline_polygons_h, slopes, slopes_h, contours_only_text_parent_d_ordered, contours_only_text_parent_h_d_ordered = check_any_text_region_in_model_one_is_main_or_header_light(text_regions_p, regions_fully, contours_only_text_parent, all_box_coord, all_found_textline_polygons, slopes, contours_only_text_parent_d_ordered)
 | 
			
		||||
                    else:
 | 
			
		||||
| 
						 | 
				
			
			@ -3957,9 +3958,9 @@ class Eynollah:
 | 
			
		|||
                    if np.abs(slope_deskew) < SLOPE_THRESHOLD:
 | 
			
		||||
                        order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent, contours_only_text_parent_h, boxes, textline_mask_tot)
 | 
			
		||||
                    else:
 | 
			
		||||
                        if len(contours_only_text_parent_d_ordered)>1:
 | 
			
		||||
                        try:
 | 
			
		||||
                            contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=object)[index_by_text_par_con])
 | 
			
		||||
                        else:
 | 
			
		||||
                        except:
 | 
			
		||||
                            contours_only_text_parent_d_ordered = list(np.array(contours_only_text_parent_d_ordered, dtype=np.int32)[index_by_text_par_con])
 | 
			
		||||
                        order_text_new, id_of_texts_tot = self.do_order_of_regions(contours_only_text_parent_d_ordered, contours_only_text_parent_h, boxes_d, textline_mask_tot_d)
 | 
			
		||||
                    
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -137,6 +137,29 @@ class EynollahXmlWriter():
 | 
			
		|||
                points_co += ' '
 | 
			
		||||
            coords.set_points(points_co[:-1])
 | 
			
		||||
            
 | 
			
		||||
    def serialize_lines_in_dropcapital(self, text_region, all_found_textline_polygons, region_idx, page_coord, all_box_coord, slopes, counter, ocr_all_textlines_textregion):
        """Attach a single text line to a drop-capital region.

        Only the first polygon in ``all_found_textline_polygons`` is
        serialized: one ``TextLineType`` is created, added to
        ``text_region`` and given that polygon as its coordinates.

        Args:
            text_region: region object receiving the line via ``add_TextLine``.
            all_found_textline_polygons: sequence whose first element is the
                polygon to write. Each point is either a flat ``(x, y)`` pair
                or a nested ``[[x, y]]`` entry.
            region_idx: unused; kept for signature parity with callers.
            page_coord: page offsets; index 2 is added to x and index 0 to y
                before scaling.
            all_box_coord: unused; kept for signature parity with callers.
            slopes: unused; kept for signature parity with callers.
            counter: id provider; ``next_line_id`` is read once.
            ocr_all_textlines_textregion: optional sequence of recognized
                texts; when truthy, its first element becomes the line's
                ``TextEquiv``.
        """
        self.logger.debug('enter serialize_lines_in_dropcapital')

        coords = CoordsType()
        textline = TextLineType(id=counter.next_line_id, Coords=coords)
        if ocr_all_textlines_textregion:
            textline.set_TextEquiv([TextEquivType(Unicode=ocr_all_textlines_textregion[0])])
        text_region.add_TextLine(textline)

        x_offset = page_coord[2]
        y_offset = page_coord[0]
        points = []
        for contour_point in all_found_textline_polygons[0]:
            if len(contour_point) == 2:
                # Flat point: (x, y).
                x, y = contour_point[0], contour_point[1]
            else:
                # Nested point: [[x, y]].
                x, y = contour_point[0][0], contour_point[0][1]
            points.append('%d,%d' % (int((x + x_offset) / self.scale_x),
                                     int((y + y_offset) / self.scale_y)))
        coords.set_points(' '.join(points))
 | 
			
		||||
 | 
			
		||||
    def write_pagexml(self, pcgts):
 | 
			
		||||
        out_fname = os.path.join(self.dir_out, self.image_filename_stem) + ".xml"
 | 
			
		||||
        self.logger.info("output filename: '%s'", out_fname)
 | 
			
		||||
| 
						 | 
				
			
			@ -251,8 +274,12 @@ class EynollahXmlWriter():
 | 
			
		|||
            self.serialize_lines_in_marginal(marginal, all_found_textline_polygons_marginals, mm, page_coord, all_box_coord_marginals, slopes_marginals, counter)
 | 
			
		||||
 | 
			
		||||
        for mm in range(len(found_polygons_drop_capitals)):
 | 
			
		||||
            page.add_TextRegion(TextRegionType(id=counter.next_region_id, type_='drop-capital',
 | 
			
		||||
                    Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord))))
 | 
			
		||||
            dropcapital = TextRegionType(id=counter.next_region_id, type_='drop-capital',
 | 
			
		||||
                    Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_drop_capitals[mm], page_coord)))
 | 
			
		||||
            page.add_TextRegion(dropcapital)
 | 
			
		||||
            all_box_coord_drop = None
 | 
			
		||||
            slopes_drop = None
 | 
			
		||||
            self.serialize_lines_in_dropcapital(dropcapital, [found_polygons_drop_capitals[mm]], mm, page_coord, all_box_coord_drop, slopes_drop, counter, ocr_all_textlines_textregion=None)
 | 
			
		||||
 | 
			
		||||
        for mm in range(len(found_polygons_text_region_img)):
 | 
			
		||||
            page.add_ImageRegion(ImageRegionType(id=counter.next_region_id, Coords=CoordsType(points=self.calculate_polygon_coords(found_polygons_text_region_img[mm], page_coord))))
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue