In OCR, the predicted text is now drawn on the image, and the results are saved in a specified directory. This makes it easier to review the predicted output.

main
vahidrezanezhad 4 days ago
parent 9b04688ebc
commit b1da0a3327

@ -334,6 +334,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
help="directory of xmls",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--dir_out_image_text",
"-doit",
help="directory of images with predicted text",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--model",
"-m",
@ -359,6 +365,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
is_flag=True,
help="if this parameter set to true, cropped textline images will not be masked with textline contour.",
)
@click.option(
"--draw_texts_on_image",
"-dtoi/-ndtoi",
is_flag=True,
help="if this parameter set to true, the predicted texts will be displayed on an image.",
)
@click.option(
"--log_level",
"-l",
@ -366,18 +378,20 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
help="Override log level globally to this",
)
def ocr(dir_in, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, log_level):
    """CLI entry point: run Eynollah OCR over a directory of page images.

    All parameters are supplied by the click options declared above and are
    forwarded unchanged to ``Eynollah_ocr``; ``out`` maps to ``dir_out`` and
    ``model`` to ``dir_models``. When ``log_level`` is given it overrides the
    log level globally before logging is initialised.
    """
    # Apply the override first so initLogging() picks it up.
    if log_level:
        setOverrideLogLevel(log_level)
    initLogging()
    eynollah_ocr = Eynollah_ocr(
        dir_xmls=dir_xmls,
        dir_out_image_text=dir_out_image_text,
        dir_in=dir_in,
        dir_out=out,
        dir_models=model,
        tr_ocr=tr_ocr,
        export_textline_images_and_text=export_textline_images_and_text,
        do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
        draw_texts_on_image=draw_texts_on_image,
    )
    eynollah_ocr.run()

@ -22,7 +22,7 @@ from ocrd_utils import getLogger
import cv2
import numpy as np
from transformers import TrOCRProcessor
from PIL import Image
from PIL import Image, ImageDraw, ImageFont
import torch
from difflib import SequenceMatcher as sq
from transformers import VisionEncoderDecoderModel
@ -4409,7 +4409,6 @@ class Eynollah:
text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light = \
self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier)
#print("text region early -2 in %.1fs", time.time() - t0)
if num_col_classifier == 1 or num_col_classifier ==2:
if num_col_classifier == 1:
img_w_new = 1000
@ -4954,9 +4953,11 @@ class Eynollah_ocr:
dir_xmls=None,
dir_in=None,
dir_out=None,
dir_out_image_text=None,
tr_ocr=False,
export_textline_images_and_text=False,
do_not_mask_with_textline_contour=False,
draw_texts_on_image=False,
logger=None,
):
self.dir_in = dir_in
@ -4966,6 +4967,8 @@ class Eynollah_ocr:
self.tr_ocr = tr_ocr
self.export_textline_images_and_text = export_textline_images_and_text
self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
self.draw_texts_on_image = draw_texts_on_image
self.dir_out_image_text = dir_out_image_text
if tr_ocr:
self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@ -5083,6 +5086,23 @@ class Eynollah_ocr:
return peaks_final
else:
return None
def fit_text_single_line(self, draw, text, font_path, max_width, max_height):
    """Return the largest font at which *text* fits in the given box.

    Tries TrueType sizes 50, 48, ..., 12 (descending, step 2) against the
    bounding box reported by ``draw.textbbox`` and returns the first font
    whose rendered width/height are within ``max_width``/``max_height``.
    Falls back to a size-10 font when nothing fits.
    """
    for size in range(50, 10, -2):
        candidate = ImageFont.truetype(font_path, size)
        left, top, right, bottom = draw.textbbox((0, 0), text, font=candidate)
        if right - left <= max_width and bottom - top <= max_height:
            return candidate
    # Nothing fit even at size 12: return the smallest allowed font.
    return ImageFont.truetype(font_path, 10)
def return_textlines_split_if_needed(self, textline_image):
@ -5254,6 +5274,12 @@ class Eynollah_ocr:
dir_xml = os.path.join(self.dir_xmls, file_name+'.xml')
out_file_ocr = os.path.join(self.dir_out, file_name+'.xml')
img = cv2.imread(dir_img)
if self.draw_texts_on_image:
out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png')
image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white")
draw = ImageDraw.Draw(image_text)
total_bb_coordinates = []
tree1 = ET.parse(dir_xml, parser = ET.XMLParser(encoding="utf-8"))
root1=tree1.getroot()
@ -5283,6 +5309,9 @@ class Eynollah_ocr:
x,y,w,h = cv2.boundingRect(textline_coords)
if self.draw_texts_on_image:
total_bb_coordinates.append([x,y,w,h])
h2w_ratio = h/float(w)
img_poly_on_img = np.copy(img)
@ -5359,6 +5388,35 @@ class Eynollah_ocr:
extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
if self.draw_texts_on_image:
font_path = "NotoSans-Regular.ttf" # Make sure this file exists!
font = ImageFont.truetype(font_path, 40)
for indexer_text, bb_ind in enumerate(total_bb_coordinates):
x_bb = bb_ind[0]
y_bb = bb_ind[1]
w_bb = bb_ind[2]
h_bb = bb_ind[3]
font = self.fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) )
##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2)
text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font)
text_width = text_bbox[2] - text_bbox[0]
text_height = text_bbox[3] - text_bbox[1]
text_x = x_bb + (w_bb - text_width) // 2 # Center horizontally
text_y = y_bb + (h_bb - text_height) // 2 # Center vertically
# Draw the text
draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font)
image_text.save(out_image_with_text)
text_by_textregion = []
for ind in unique_cropped_lines_region_indexer:

Loading…
Cancel
Save