def serialize_lines_in_region(self, textregion, all_found_texline_polygons, region_idx, page_coord, all_box_coord, slopes, id_indexer_l=0):
    """Append one ``TextLine`` element per text line of a region to *textregion*.

    Every line polygon in ``all_found_texline_polygons[region_idx]`` becomes a
    ``TextLine`` child holding a ``Coords`` element (``points`` attribute of
    the form ``"x1,y1 x2,y2 ..."``) and a ``TextEquiv/Unicode`` element whose
    text is a single space.

    Coordinates are computed as ``int((value + offsets) / scale)`` using
    ``self.scale_x`` / ``self.scale_y``:

    * ``self.curved_line`` false: the region box offset
      (``all_box_coord[region_idx][2]`` for x, ``[0]`` for y) and the page
      offset (``page_coord[2]`` for x, ``[0]`` for y) are added, and negative
      results are clamped to 0.
    * ``self.curved_line`` true and ``abs(slopes[region_idx]) <= 45``: only
      the page offset is added; nothing is clamped.
    * ``self.curved_line`` true otherwise: box and page offsets are added;
      nothing is clamped.

    Args:
        textregion: parent XML element that receives the ``TextLine`` children.
        all_found_texline_polygons: per-region sequences of line polygons.
            A polygon point is either ``[x, y]`` or ``[[x, y]]``.
        region_idx: index of the region to serialize.
        page_coord: sequence whose items ``[2]`` and ``[0]`` are the x and y
            page offsets.
        all_box_coord: per-region sequences whose items ``[2]`` and ``[0]``
            are the x and y box offsets.
        slopes: per-region slope values; only read when ``self.curved_line``
            is true and the region has at least one line.
        id_indexer_l: number used for the id (``'l<N>'``) of the first line
            written. Callers should pass their running counter and keep the
            return value so ids stay unique across regions.

    Returns:
        The next unused line id number.
    """
    region_lines = all_found_texline_polygons[region_idx]
    if len(region_lines) == 0:
        return id_indexer_l

    # The offset/clamp policy depends only on the region, so decide it once
    # instead of re-evaluating it for every polygon point.
    if self.curved_line:
        # Anything that is not "<= 45" (steep slopes, but also NaN) takes the
        # box-offset path so a point is always emitted.
        add_box_offset = not (np.abs(slopes[region_idx]) <= 45)
        clamp_negative = False
    else:
        add_box_offset = True
        clamp_negative = True

    page_x = page_coord[2]
    page_y = page_coord[0]
    if add_box_offset:
        box_x = all_box_coord[region_idx][2]
        box_y = all_box_coord[region_idx][0]
    else:
        box_x = 0
        box_y = 0

    for line_polygon in region_lines:
        textline = ET.SubElement(textregion, 'TextLine')
        textline.set('id', 'l' + str(id_indexer_l))
        id_indexer_l += 1
        coord = ET.SubElement(textline, 'Coords')
        texteq = ET.SubElement(textline, 'TextEquiv')
        uni = ET.SubElement(texteq, 'Unicode')
        uni.text = ' '

        points = []
        for point in line_polygon:
            # Points that are not a flat [x, y] pair are wrapped one level
            # deeper ([[x, y]]); unwrap them.
            if len(point) != 2:
                point = point[0]
            x_coord = int((point[0] + box_x + page_x) / self.scale_x)
            y_coord = int((point[1] + box_y + page_y) / self.scale_y)
            if clamp_negative:
                x_coord = max(x_coord, 0)
                y_coord = max(y_coord, 0)
            points.append(str(x_coord) + ',' + str(y_coord))
        coord.set('points', ' '.join(points))

    return id_indexer_l
found_polygons_text_region = contours @@ -1460,110 +1537,20 @@ class eynollah: points_co='' for lmm in range(len(found_polygons_text_region[mm])): if len(found_polygons_text_region[mm][lmm])==2: - points_co=points_co+str( int( (found_polygons_text_region[mm][lmm][0] +page_coord[2])/self.scale_x ) ) + points_co=points_co+str(int((found_polygons_text_region[mm][lmm][0] + page_coord[2]) / self.scale_x)) points_co=points_co+',' - points_co=points_co+str( int( (found_polygons_text_region[mm][lmm][1] +page_coord[0])/self.scale_y ) ) + points_co=points_co+str(int((found_polygons_text_region[mm][lmm][1] + page_coord[0]) / self.scale_y)) else: - points_co=points_co+str( int((found_polygons_text_region[mm][lmm][0][0] +page_coord[2])/self.scale_x) ) + points_co=points_co+str(int((found_polygons_text_region[mm][lmm][0][0] + page_coord[2]) / self.scale_x)) points_co=points_co+',' - points_co=points_co+str( int((found_polygons_text_region[mm][lmm][0][1] +page_coord[0])/self.scale_y) ) + points_co=points_co+str(int((found_polygons_text_region[mm][lmm][0][1] + page_coord[0]) / self.scale_y)) if lmm<(len(found_polygons_text_region[mm])-1): points_co=points_co+' ' #print(points_co) coord_text.set('points',points_co) - - - - - - for j in range(len(all_found_texline_polygons[mm])): - - textline=ET.SubElement(textregion, 'TextLine') - - textline.set('id','l'+str(id_indexer_l)) - - id_indexer_l+=1 - - - coord = ET.SubElement(textline, 'Coords') - - texteq=ET.SubElement(textline, 'TextEquiv') - - uni=ET.SubElement(texteq, 'Unicode') - uni.text = ' ' - - #points = ET.SubElement(coord, 'Points') - - points_co='' - for l in range(len(all_found_texline_polygons[mm][j])): - #point = ET.SubElement(coord, 'Point') - - - if not self.curved_line: - #point.set('x',str(found_polygons[j][l][0])) - #point.set('y',str(found_polygons[j][l][1])) - if len(all_found_texline_polygons[mm][j][l])==2: - textline_x_coord=int( (all_found_texline_polygons[mm][j][l][0] - 
+all_box_coord[mm][2]+page_coord[2])/self.scale_x) - textline_y_coord=int( (all_found_texline_polygons[mm][j][l][1] - +all_box_coord[mm][0]+page_coord[0])/self.scale_y) - - if textline_x_coord<0: - textline_x_coord=0 - if textline_y_coord<0: - textline_y_coord=0 - points_co=points_co+str( textline_x_coord ) - points_co=points_co+',' - points_co=points_co+str( textline_y_coord ) - else: - - textline_x_coord=int( ( all_found_texline_polygons[mm][j][l][0][0] - +all_box_coord[mm][2]+page_coord[2])/self.scale_x ) - textline_y_coord=int( ( all_found_texline_polygons[mm][j][l][0][1] - +all_box_coord[mm][0]+page_coord[0])/self.scale_y) - - if textline_x_coord<0: - textline_x_coord=0 - if textline_y_coord<0: - textline_y_coord=0 - - points_co=points_co+str( textline_x_coord ) - points_co=points_co+',' - points_co=points_co+str( textline_y_coord ) - - if (self.curved_line) and np.abs(slopes[mm]) <= 45 : - if len(all_found_texline_polygons[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons[mm][j][l][0] - +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons[mm][j][l][1] - +page_coord[0])/self.scale_y) ) - else: - points_co=points_co+str( int( ( all_found_texline_polygons[mm][j][l][0][0] - +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons[mm][j][l][0][1] - +page_coord[0])/self.scale_y) ) - elif (self.curved_line) and np.abs(slopes[mm]) > 45 : - if len(all_found_texline_polygons[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons[mm][j][l][0] - +all_box_coord[mm][2]+page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons[mm][j][l][1] - +all_box_coord[mm][0]+page_coord[0])/self.scale_y) ) - else: - points_co=points_co+str( int( ( all_found_texline_polygons[mm][j][l][0][0] - +all_box_coord[mm][2]+page_coord[2])/self.scale_x ) ) - 
points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons[mm][j][l][0][1] - +all_box_coord[mm][0]+page_coord[0])/self.scale_y) ) - - if l<(len(all_found_texline_polygons[mm][j])-1): - points_co=points_co+' ' - #print(points_co) - coord.set('points',points_co) - + + self.serialize_lines_in_region(textregion, all_found_texline_polygons, mm, page_coord, all_box_coord) texteqreg=ET.SubElement(textregion, 'TextEquiv') unireg=ET.SubElement(texteqreg, 'Unicode') @@ -1590,102 +1577,8 @@ class eynollah: # textregion.set('type','paragraph') coord_text = ET.SubElement(textregion, 'Coords') - points_co='' - for lmm in range(len(found_polygons_text_region_h[mm])): - - if len(found_polygons_text_region_h[mm][lmm])==2: - - points_co=points_co+str( int( (found_polygons_text_region_h[mm][lmm][0] +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( (found_polygons_text_region_h[mm][lmm][1] +page_coord[0])/self.scale_y ) ) - else: - points_co=points_co+str( int((found_polygons_text_region_h[mm][lmm][0][0] +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int((found_polygons_text_region_h[mm][lmm][0][1] +page_coord[0])/self.scale_y) ) - - if lmm<(len(found_polygons_text_region_h[mm])-1): - points_co=points_co+' ' - #print(points_co) - coord_text.set('points',points_co) - - - for j in range(len(all_found_texline_polygons_h[mm])): - - textline=ET.SubElement(textregion, 'TextLine') - - textline.set('id','l'+str(id_indexer_l)) - - id_indexer_l+=1 - - - coord = ET.SubElement(textline, 'Coords') - - texteq=ET.SubElement(textline, 'TextEquiv') - - uni=ET.SubElement(texteq, 'Unicode') - uni.text = ' ' - - #points = ET.SubElement(coord, 'Points') - - points_co='' - for l in range(len(all_found_texline_polygons_h[mm][j])): - #point = ET.SubElement(coord, 'Point') - - - if not self.curved_line: - #point.set('x',str(found_polygons[j][l][0])) - #point.set('y',str(found_polygons[j][l][1])) - if 
len(all_found_texline_polygons_h[mm][j][l])==2: - - textline_x_coord=int( (all_found_texline_polygons_h[mm][j][l][0] - +all_box_coord_h[mm][2]+page_coord[2])/self.scale_x) - textline_y_coord=int( (all_found_texline_polygons_h[mm][j][l][1] - +all_box_coord_h[mm][0]+page_coord[0])/self.scale_y) - - if textline_x_coord<0: - textline_x_coord=0 - if textline_y_coord<0: - textline_y_coord=0 - - points_co=points_co+str( textline_x_coord ) - points_co=points_co+',' - points_co=points_co+str( textline_y_coord ) - else: - - - textline_x_coord=int( ( all_found_texline_polygons_h[mm][j][l][0][0] - +all_box_coord_h[mm][2]+page_coord[2])/self.scale_x ) - textline_y_coord=int( ( all_found_texline_polygons_h[mm][j][l][0][1] - +all_box_coord_h[mm][0]+page_coord[0])/self.scale_y) - - if textline_x_coord<0: - textline_x_coord=0 - if textline_y_coord<0: - textline_y_coord=0 - - points_co=points_co+str( textline_x_coord ) - points_co=points_co+',' - points_co=points_co+str( textline_y_coord) - - if self.curved_line: - if len(all_found_texline_polygons_h[mm][j][l])==2: - points_co=points_co+str( int( (all_found_texline_polygons_h[mm][j][l][0] - +page_coord[2])/self.scale_x) ) - points_co=points_co+',' - points_co=points_co+str( int( (all_found_texline_polygons_h[mm][j][l][1] - +page_coord[0])/self.scale_y) ) - else: - points_co=points_co+str( int( ( all_found_texline_polygons_h[mm][j][l][0][0] - +page_coord[2])/self.scale_x ) ) - points_co=points_co+',' - points_co=points_co+str( int( ( all_found_texline_polygons_h[mm][j][l][0][1] - +page_coord[0])/self.scale_y) ) - - if l<(len(all_found_texline_polygons_h[mm][j])-1): - points_co=points_co+' ' - #print(points_co) - coord.set('points',points_co) + self.serialize_lines_in_region(textregion, all_found_texline_polygons_h, mm, page_coord, all_box_coord, slopes) texteqreg=ET.SubElement(textregion, 'TextEquiv') unireg=ET.SubElement(texteqreg, 'Unicode')