Add visualization of the OCR text of an XML file

This commit is contained in:
vahidrezanezhad 2025-08-06 22:33:42 +02:00
parent cf63bd92bc
commit 1fe31bdeb3
2 changed files with 152 additions and 0 deletions

View file

@ -3,6 +3,7 @@ import json
from gt_gen_utils import * from gt_gen_utils import *
from tqdm import tqdm from tqdm import tqdm
from pathlib import Path from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
@click.group() @click.group()
def main(): def main():
@ -447,6 +448,86 @@ def visualize_layout_segmentation(xml_file, dir_xml, dir_out, dir_imgs):
cv2.imwrite(os.path.join(dir_out, f_name+'.png'), added_image) cv2.imwrite(os.path.join(dir_out, f_name+'.png'), added_image)
@main.command()
@click.option(
    "--xml_file",
    "-xml",
    help="xml filename",
    type=click.Path(exists=True, dir_okay=False),
)
@click.option(
    "--dir_xml",
    "-dx",
    help="directory of GT page-xml files",
    type=click.Path(exists=True, file_okay=False),
)
@click.option(
    "--dir_out",
    "-do",
    help="directory where plots will be written",
    type=click.Path(exists=True, file_okay=False),
)
def visualize_ocr_text(xml_file, dir_xml, dir_out):
    """Render the OCR text of each text line onto a blank page image.

    For every input XML file a white RGB canvas with the page size returned
    by ``get_textline_contours_and_ocr_text`` is created. The text of each
    text line is drawn in black, centred inside the bounding rectangle of
    the line's contour, with a font chosen by ``fit_text_single_line`` so
    that the text fits the rectangle's width and 40% of its height. The
    canvas is saved as ``<dir_out>/<stem of the xml filename>.png``.

    Args:
        xml_file: Path of a single XML file. Only used when ``dir_xml`` is
            not given.
        dir_xml: Directory whose entries are all processed as XML files.
            Takes precedence over ``xml_file`` when both are given.
        dir_out: Existing directory the PNG files are written to.

    Raises:
        click.UsageError: If neither ``xml_file`` nor ``dir_xml`` is given,
            or if ``dir_out`` is missing.
    """
    # Explicit errors instead of ``assert``: assertions vanish under ``-O``.
    if not (xml_file or dir_xml):
        raise click.UsageError(
            "A single xml file -xml or a dir of xml files -dx is required not both of them"
        )
    if not dir_out:
        # Without this check the failure would only surface as a TypeError
        # in os.path.join() after the first page had been fully rendered.
        raise click.UsageError("An output directory -do is required")

    if dir_xml:
        xml_files_ind = os.listdir(dir_xml)
    else:
        xml_files_ind = [xml_file]

    font_path = "Charis-7.000/Charis-Regular.ttf"  # Make sure this file exists!
    # Load the font once up front purely as a fail-fast check, so a missing
    # font file is reported before any XML file is processed.
    ImageFont.truetype(font_path, 40)

    for ind_xml in tqdm(xml_files_ind):
        xml_path = os.path.join(dir_xml, ind_xml) if dir_xml else ind_xml
        f_name = Path(ind_xml).stem

        co_tetxlines, y_len, x_len, ocr_texts = get_textline_contours_and_ocr_text(xml_path)

        image_text = Image.new("RGB", (x_len, y_len), "white")
        draw = ImageDraw.Draw(image_text)

        for cnt, text in zip(co_tetxlines, ocr_texts):
            # Nothing to draw for a line without text (empty or None), and
            # no rectangle can be computed for a line without coordinates.
            if not text or len(cnt) == 0:
                continue

            x, y, w, h = cv2.boundingRect(cnt)
            font = fit_text_single_line(draw, text, font_path, w, int(h * 0.4))

            text_bbox = draw.textbbox((0, 0), text, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]

            text_x = x + (w - text_width) // 2  # Center horizontally
            text_y = y + (h - text_height) // 2  # Center vertically

            draw.text((text_x, text_y), text, fill="black", font=font)

        image_text.save(os.path.join(dir_out, f_name + '.png'))
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View file

@ -9,6 +9,7 @@ import cv2
from shapely import geometry from shapely import geometry
from pathlib import Path from pathlib import Path
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFont
KERNEL = np.ones((5, 5), np.uint8) KERNEL = np.ones((5, 5), np.uint8)
@ -283,6 +284,76 @@ def get_textline_contours_for_visualization(xml_file):
return co_use_case, y_len, x_len return co_use_case, y_len, x_len
def _textline_unicode_text(textline):
    """Return the text of the last ``TextEquiv/Unicode`` directly under *textline*.

    Only direct ``TextEquiv`` children of the text line are inspected.
    An empty ``Unicode`` element yields ``''`` rather than ``None``.
    """
    text = ''
    for child in textline:
        if child.tag.endswith("TextEquiv"):
            for unicode_el in child:
                if unicode_el.tag.endswith("Unicode"):
                    text = unicode_el.text or ''
    return text


def _parse_points_attribute(points):
    """Convert a ``"x1,y1 x2,y2 ..."`` string into an ``(N, 2)`` integer array.

    Pairs may be separated by any amount of whitespace, and coordinates
    written as floats are truncated to integers (as done for ``Point``
    elements).
    """
    pairs = (pair.split(',') for pair in points.split())
    return np.array([[int(float(p[0])), int(float(p[1]))] for p in pairs])


def get_textline_contours_and_ocr_text(xml_file):
    """Read text line contours and their OCR text from an XML file.

    Args:
        xml_file: Path of the XML file to parse.

    Returns:
        A tuple ``(contours, y_len, x_len, texts)``:

        * ``contours`` - one numpy array per ``TextLine`` element. For a
          ``Coords`` element carrying a ``points`` attribute the array has
          shape ``(1, N, 2)``; for ``Point`` child elements it has shape
          ``(N, 2)``; it is empty when the line has no coordinates.
        * ``y_len``, ``x_len`` - ``imageHeight`` / ``imageWidth`` of the
          ``Page`` element (the last one, should there be several).
        * ``texts`` - one string per ``TextLine``, aligned with
          ``contours``; ``''`` when the line has no text.

    Raises:
        ValueError: If the document has no ``Page`` element.
    """
    # Do not force an encoding on the parser: an explicit ``encoding``
    # overrides the one declared in the document, and decoding UTF-8 files
    # as ISO-8859-5 garbles every non-ASCII character of the OCR text.
    tree1 = ET.parse(xml_file)
    root1 = tree1.getroot()
    alltags = [elem.tag for elem in root1.iter()]
    # Namespace prefix in ElementTree's "{uri}" notation, taken from the root.
    link = alltags[0].split('}')[0] + '}'

    y_len = x_len = None
    for page in root1.iter(link + 'Page'):
        y_len = int(page.attrib['imageHeight'])
        x_len = int(page.attrib['imageWidth'])
    if y_len is None or x_len is None:
        raise ValueError(f"No Page element found in {xml_file!r}")

    textline_tags = sorted({tag for tag in alltags if tag.endswith('}TextLine')})

    co_use_case = []
    ocr_textlines = []
    for tag in textline_tags:
        for textline in root1.iter(tag):
            contour = []
            text = ''
            n_points = 0
            # iter() yields the text line itself first, then its
            # descendants in document order.
            for elem in textline.iter():
                if elem.tag == link + 'Coords':
                    text = _textline_unicode_text(textline)
                    if elem.attrib:
                        contour.append(_parse_points_attribute(elem.attrib['points']))
                        # The line's outline is complete; do not descend
                        # into nested elements that carry their own Coords.
                        break

                if elem.tag == link + 'Point':
                    contour.append([int(float(elem.attrib['x'])), int(float(elem.attrib['y']))])
                    n_points += 1
                elif n_points >= 1:
                    # First non-Point element after a run of Points: the
                    # outline given as Point children has ended.
                    break

            co_use_case.append(np.array(contour))
            ocr_textlines.append(text)
    return co_use_case, y_len, x_len, ocr_textlines
def fit_text_single_line(draw, text, font_path, max_width, max_height,
                         initial_font_size=50, min_font_size=10, step=2):
    """Return the largest tried font in which *text* fits the given box.

    Font sizes are tried from ``initial_font_size`` downwards in decrements
    of ``step``, stopping before ``min_font_size``. The first size whose
    rendered text is at most ``max_width`` wide and ``max_height`` high
    is returned. If no tried size fits, a font of ``min_font_size`` is
    returned even though the text may still overflow the box.

    Args:
        draw: Object providing ``textbbox(xy, text, font=...)``, used to
            measure the rendered text.
        text: The single line of text to fit; ``None`` is treated as ``''``.
        font_path: Path of the font file passed to ``ImageFont.truetype``.
        max_width: Maximum allowed text width.
        max_height: Maximum allowed text height.
        initial_font_size: Largest font size to try.
        min_font_size: Font size used as the fallback.
        step: Positive decrement between two tried sizes.

    Returns:
        The font object created by ``ImageFont.truetype``.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")
    if text is None:
        text = ""

    for font_size in range(initial_font_size, min_font_size, -step):
        font = ImageFont.truetype(font_path, font_size)
        # Measure at the origin; only the extent of the box matters here.
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        if right - left <= max_width and bottom - top <= max_height:
            return font  # Largest tried size that fits

    return ImageFont.truetype(font_path, min_font_size)  # Smallest font fallback
def get_layout_contours_for_visualization(xml_file): def get_layout_contours_for_visualization(xml_file):
tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding = 'iso-8859-5')) tree1 = ET.parse(xml_file, parser = ET.XMLParser(encoding = 'iso-8859-5'))
root1=tree1.getroot() root1=tree1.getroot()