Load XML files with UTF-8 encoding instead of ISO-8859-5; skip rendering of empty OCR text

2026-06-19 03:19:14 +02:00 · 2025-08-07 10:32:49 +02:00 · 2025-08-07 10:32:49 +02:00 · ef0f08ec1f
commit ef0f08ec1f
parent 1fe31bdeb3
2 changed files with 18 additions and 18 deletions
--- a/generate_gt_for_training.py
+++ b/generate_gt_for_training.py
@@ -513,20 +513,20 @@ def visualize_ocr_text(xml_file, dir_xml, dir_out):
            #y_bb = bb_ind[1]
            #w_bb = bb_ind[2]
            #h_bb = bb_ind[3]
+            if ocr_texts[index]:
+                font = fit_text_single_line(draw, ocr_texts[index], font_path, w, int(h*0.4) )
                
-            font = fit_text_single_line(draw, ocr_texts[index], font_path, w, int(h*0.4) )
+                ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2)
                
-            ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2)
+                text_bbox = draw.textbbox((0, 0), ocr_texts[index], font=font)
+                text_width = text_bbox[2] - text_bbox[0]
+                text_height = text_bbox[3] - text_bbox[1]

-            text_bbox = draw.textbbox((0, 0), ocr_texts[index], font=font)
-            text_width = text_bbox[2] - text_bbox[0]
-            text_height = text_bbox[3] - text_bbox[1]
+                text_x = x + (w - text_width) // 2  # Center horizontally
+                text_y = y + (h - text_height) // 2  # Center vertically

-            text_x = x + (w - text_width) // 2  # Center horizontally
-            text_y = y + (h - text_height) // 2  # Center vertically
-
-            # Draw the text
-            draw.text((text_x, text_y), ocr_texts[index], fill="black", font=font)
+                # Draw the text
+                draw.text((text_x, text_y), ocr_texts[index], fill="black", font=font)
        image_text.save(os.path.join(dir_out, f_name+'.png'))
    
 if __name__ == "__main__":
--- a/gt_gen_utils.py
+++ b/gt_gen_utils.py
@@ -244,7 +244,7 @@ def update_region_contours(co_text, img_boundary, erosion_rate, dilation_rate, y
    return co_text_eroded, img_boundary

 def get_textline_contours_for_visualization(xml_file):
-    tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding = 'iso-8859-5'))
+    tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8'))
    root1=tree1.getroot()
    alltags=[elem.tag for elem in root1.iter()]
    link=alltags[0].split('}')[0]+'}'
@@ -285,7 +285,7 @@ def get_textline_contours_for_visualization(xml_file):


 def get_textline_contours_and_ocr_text(xml_file):
-    tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding = 'iso-8859-5'))
+    tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8'))
    root1=tree1.getroot()
    alltags=[elem.tag for elem in root1.iter()]
    link=alltags[0].split('}')[0]+'}'
@@ -355,7 +355,7 @@ def fit_text_single_line(draw, text, font_path, max_width, max_height):
    return ImageFont.truetype(font_path, 10)  # Smallest font fallback

 def get_layout_contours_for_visualization(xml_file):
-    tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding = 'iso-8859-5'))
+    tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8'))
    root1=tree1.getroot()
    alltags=[elem.tag for elem in root1.iter()]
    link=alltags[0].split('}')[0]+'}'
@@ -630,7 +630,7 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
    for index in tqdm(range(len(gt_list))):
        #try:
        print(gt_list[index])
-        tree1 = ET.parse(dir_in+'/'+gt_list[index], parser = ET.XMLParser(encoding = 'iso-8859-5'))
+        tree1 = ET.parse(dir_in+'/'+gt_list[index], parser = ET.XMLParser(encoding='utf-8'))
        root1=tree1.getroot()
        alltags=[elem.tag for elem in root1.iter()]
        link=alltags[0].split('}')[0]+'}'
@@ -1311,7 +1311,7 @@ def find_new_features_of_contours(contours_main):
    return cx_main, cy_main, x_min_main, x_max_main, y_min_main, y_max_main, y_corr_x_min_from_argmin
 def read_xml(xml_file):
    file_name = Path(xml_file).stem
-    tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding = 'iso-8859-5'))
+    tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding='utf-8'))
    root1=tree1.getroot()
    alltags=[elem.tag for elem in root1.iter()]
    link=alltags[0].split('}')[0]+'}'