Add visualization of the OCR text of a PAGE-XML file

2026-01-09 20:07:12 +01:00 · 2025-08-06 22:33:42 +02:00 · 2025-08-06 22:33:42 +02:00 · 1fe31bdeb3
commit 1fe31bdeb3
parent cf63bd92bc
2 changed files with 152 additions and 0 deletions
--- a/generate_gt_for_training.py
+++ b/generate_gt_for_training.py
@ -3,6 +3,7 @@ import json
 from gt_gen_utils import *
 from tqdm import tqdm
 from pathlib import Path
 from PIL import Image, ImageDraw, ImageFont
@click.group()
 def main():
@ -447,6 +448,86 @@ def visualize_layout_segmentation(xml_file, dir_xml, dir_out, dir_imgs):
        cv2.imwrite(os.path.join(dir_out, f_name+'.png'), added_image)
@main.command()
@click.option(
    "--xml_file",
    "-xml",
    help="xml filename",
    type=click.Path(exists=True, dir_okay=False),
)
@click.option(
    "--dir_xml",
    "-dx",
    help="directory of GT page-xml files",
    type=click.Path(exists=True, file_okay=False),
)
@click.option(
    "--dir_out",
    "-do",
    help="directory where plots will be written",
    type=click.Path(exists=True, file_okay=False),
)
def visualize_ocr_text(xml_file, dir_xml, dir_out):
    """Render the OCR text of each text line onto a blank page image.

    For every input xml a white image with the page dimensions is created,
    the text of each text line is drawn centred inside the bounding box of
    its contour, and the result is saved as <xml stem>.png in the output
    directory. If both -dx and -xml are given, the directory is used.
    """
    # Explicit checks instead of `assert`: assertions vanish under `python -O`
    # and a missing option would otherwise only fail deep inside the loop.
    if not (xml_file or dir_xml):
        raise click.UsageError(
            "Either a single xml file (-xml) or a directory of xml files (-dx) is required."
        )
    if not dir_out:
        raise click.UsageError("An output directory (-do) is required.")

    if dir_xml:
        xml_paths = [os.path.join(dir_xml, name) for name in os.listdir(dir_xml)]
    else:
        xml_paths = [xml_file]

    font_path = "Charis-7.000/Charis-Regular.ttf"  # Make sure this file exists!
    # Load the font once up front so a missing font file fails before any
    # page has been processed; the per-line font is chosen further below.
    ImageFont.truetype(font_path, 40)

    for xml_path in tqdm(xml_paths):
        f_name = Path(xml_path).stem
        print(f_name, 'f_name')

        co_textlines, y_len, x_len, ocr_texts = get_textline_contours_and_ocr_text(xml_path)

        image_text = Image.new("RGB", (x_len, y_len), "white")
        draw = ImageDraw.Draw(image_text)

        for cnt, text in zip(co_textlines, ocr_texts):
            # Nothing to draw for lines without text (None or '') and no
            # bounding box can be computed for lines without coordinates.
            if not text or cnt.size == 0:
                continue

            x, y, w, h = cv2.boundingRect(cnt)
            # The text may use at most 40% of the line's bounding-box height.
            font = fit_text_single_line(draw, text, font_path, w, int(h * 0.4))

            left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
            text_width = right - left
            text_height = bottom - top
            # Centre the inked box inside the line's bounding box. textbbox
            # reports the box relative to the drawing origin, so its own
            # left/top offset has to be subtracted from the drawing position.
            text_x = x + (w - text_width) // 2 - left
            text_y = y + (h - text_height) // 2 - top

            draw.text((text_x, text_y), text, fill="black", font=font)

        image_text.save(os.path.join(dir_out, f_name + '.png'))
 # Script entry point: dispatch to the click command group `main`.
 if __name__ == "__main__":
    main()
--- a/gt_gen_utils.py
+++ b/gt_gen_utils.py
@ -9,6 +9,7 @@ import cv2
 from shapely import geometry
 from pathlib import Path
 import matplotlib.pyplot as plt
 from PIL import Image, ImageDraw, ImageFont
 KERNEL = np.ones((5, 5), np.uint8)
@ -283,6 +284,76 @@ def get_textline_contours_for_visualization(xml_file):
    return co_use_case, y_len, x_len
 def get_textline_contours_and_ocr_text(xml_file):
    """Read text-line contours and their OCR text from a PAGE-XML file.

    Args:
        xml_file: path of the PAGE-XML file to parse.

    Returns:
        A tuple ``(contours, y_len, x_len, texts)``:
        ``contours`` is a list with one numpy array per TextLine element
        (empty if the line has no coordinates), ``y_len`` / ``x_len`` are the
        ``imageHeight`` / ``imageWidth`` attributes of the Page element, and
        ``texts`` is a list, parallel to ``contours``, holding the text of the
        line's direct ``TextEquiv/Unicode`` child ('' if there is none or it
        is empty).

    Raises:
        ValueError: if the file contains no Page element.
    """
    # Use the default parser so the encoding declared in the file (UTF-8 when
    # nothing is declared) is honoured. Forcing a fixed single-byte encoding
    # overrides the declaration and garbles every non-ASCII character of the
    # OCR text.
    tree = ET.parse(xml_file)
    root = tree.getroot()
    all_tags = [elem.tag for elem in root.iter()]
    # Namespace prefix in ElementTree's "{uri}" notation, taken from the root tag.
    link = all_tags[0].split('}')[0] + '}'

    y_len = x_len = None
    for page in root.iter(link + 'Page'):
        y_len = int(page.attrib['imageHeight'])
        x_len = int(page.attrib['imageWidth'])
    if y_len is None or x_len is None:
        raise ValueError("No Page element found in {}".format(xml_file))

    # Distinct, sorted, namespaced TextLine tags.
    textline_tags = sorted({tag for tag in all_tags if tag.endswith('}TextLine')})

    co_use_case = []
    ocr_textlines = []
    for tag in textline_tags:
        for textline in root.iter(tag):
            c_t_in = []
            ocr_text = ''
            n_points = 0
            for elem in textline.iter():
                if elem.tag == link + 'Coords':
                    # The text lives in the TextLine's own TextEquiv/Unicode
                    # child; if there are several, the last one wins.
                    for child in textline:
                        if child.tag.endswith("TextEquiv"):
                            for sub in child:
                                if sub.tag.endswith("Unicode"):
                                    # An empty <Unicode/> has text None.
                                    ocr_text = sub.text or ''
                    if elem.attrib:
                        # Coordinates given as a "x,y x,y ..." points
                        # attribute. split() tolerates repeated or trailing
                        # whitespace, int(float()) tolerates float values.
                        points = elem.attrib['points'].split()
                        c_t_in.append(
                            np.array([[int(float(p.split(',')[0])), int(float(p.split(',')[1]))]
                                      for p in points]))
                        break
                if elem.tag == link + 'Point':
                    # Coordinates given as individual Point child elements.
                    c_t_in.append([int(float(elem.attrib['x'])), int(float(elem.attrib['y']))])
                    n_points += 1
                elif n_points >= 1:
                    # First non-Point element after a run of Points: done.
                    break
            co_use_case.append(np.array(c_t_in))
            ocr_textlines.append(ocr_text)
    return co_use_case, y_len, x_len, ocr_textlines
 def fit_text_single_line(draw, text, font_path, max_width, max_height,
                          initial_font_size=50, min_font_size=10, step=2):
    """Return the largest tried font with which `text` fits into a box.

    Font sizes are tried from `initial_font_size` downwards in decrements of
    `step`, stopping before `min_font_size`; the first size whose rendered
    text is at most `max_width` wide and `max_height` high is returned.

    Args:
        draw: drawing object whose `textbbox` method is used for measuring.
        text: the single line of text to fit.
        font_path: path of the font file passed to `ImageFont.truetype`.
        max_width: maximum allowed text width.
        max_height: maximum allowed text height.
        initial_font_size: first (largest) font size to try.
        min_font_size: size of the fallback font; it is returned without a
            fit check when no larger size fits.
        step: amount by which the font size is reduced per attempt.

    Returns:
        The font object loaded with `ImageFont.truetype` at the chosen size.

    Raises:
        ValueError: if `step` is not positive.
    """
    if step <= 0:
        # A non-positive step would never reach the minimum size.
        raise ValueError("step must be a positive number")

    for font_size in range(initial_font_size, min_font_size, -step):
        font = ImageFont.truetype(font_path, font_size)
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        if right - left <= max_width and bottom - top <= max_height:
            return font  # Largest tried size that fits
    return ImageFont.truetype(font_path, min_font_size)  # Smallest font fallback
 def get_layout_contours_for_visualization(xml_file):
    tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding = 'iso-8859-5'))
    root1=tree1.getroot()