diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py
index c306ac5..369dc4c 100644
--- a/src/eynollah/cli.py
+++ b/src/eynollah/cli.py
@@ -334,6 +334,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     help="directory of xmls",
     type=click.Path(exists=True, file_okay=False),
 )
+@click.option(
+    "--dir_out_image_text",
+    "-doit",
+    help="directory of images with predicted text",
+    type=click.Path(exists=True, file_okay=False),
+)
 @click.option(
     "--model",
     "-m",
@@ -359,6 +365,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     is_flag=True,
     help="if this parameter set to true, cropped textline images will not be masked with textline contour.",
 )
+@click.option(
+    "--draw_texts_on_image",
+    "-dtoi/-ndtoi",
+    is_flag=True,
+    help="if this parameter is set to true, the predicted texts will be drawn on an image.",
+)
 @click.option(
     "--log_level",
     "-l",
@@ -366,18 +378,20 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     help="Override log level globally to this",
 )
 
-def ocr(dir_in, out, dir_xmls, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, log_level):
+def ocr(dir_in, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, log_level):
     if log_level:
         setOverrideLogLevel(log_level)
     initLogging()
     eynollah_ocr = Eynollah_ocr(
         dir_xmls=dir_xmls,
+        dir_out_image_text=dir_out_image_text,
         dir_in=dir_in,
         dir_out=out,
         dir_models=model,
         tr_ocr=tr_ocr,
         export_textline_images_and_text=export_textline_images_and_text,
         do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
+        draw_texts_on_image=draw_texts_on_image,
     )
     eynollah_ocr.run()
diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py
index 9ead53e..0b93085 100644
--- a/src/eynollah/eynollah.py
+++ b/src/eynollah/eynollah.py
@@ -22,7 +22,7 @@ from ocrd_utils import getLogger
 import cv2
 import numpy as np
 from transformers import TrOCRProcessor
-from PIL import Image
+from PIL import Image, ImageDraw, ImageFont
 import torch
 from difflib import SequenceMatcher as sq
 from transformers import VisionEncoderDecoderModel
@@ -4409,7 +4409,6 @@ class Eynollah:
             text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light = \
                 self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier)
             #print("text region early -2 in %.1fs", time.time() - t0)
-
             if num_col_classifier == 1 or num_col_classifier ==2:
                 if num_col_classifier == 1:
                     img_w_new = 1000
@@ -4954,9 +4953,11 @@ class Eynollah_ocr:
         dir_xmls=None,
         dir_in=None,
         dir_out=None,
+        dir_out_image_text=None,
         tr_ocr=False,
         export_textline_images_and_text=False,
         do_not_mask_with_textline_contour=False,
+        draw_texts_on_image=False,
         logger=None,
     ):
         self.dir_in = dir_in
@@ -4966,6 +4967,8 @@ class Eynollah_ocr:
         self.tr_ocr = tr_ocr
         self.export_textline_images_and_text = export_textline_images_and_text
         self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
+        self.draw_texts_on_image = draw_texts_on_image
+        self.dir_out_image_text = dir_out_image_text
         if tr_ocr:
             self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
             self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -5083,6 +5086,23 @@ class Eynollah_ocr:
             return peaks_final
         else:
             return None
+
+    # Fit a single line of text inside the given area by shrinking the font size.
+    def fit_text_single_line(self, draw, text, font_path, max_width, max_height):
+        initial_font_size = 50
+        font_size = initial_font_size
+        while font_size > 10: # Minimum font size
+            font = ImageFont.truetype(font_path, font_size)
+            text_bbox = draw.textbbox((0, 0), text, font=font) # Get text bounding box
+            text_width = text_bbox[2] - text_bbox[0]
+            text_height = text_bbox[3] - text_bbox[1]
+
+            if text_width <= max_width and text_height <= max_height:
+                return font # Return the best-fitting font
+
+            font_size -= 2 # Reduce font size and retry
+
+        return ImageFont.truetype(font_path, 10) # Smallest font fallback
 
     def return_textlines_split_if_needed(self, textline_image):
@@ -5254,6 +5274,12 @@ class Eynollah_ocr:
             dir_xml = os.path.join(self.dir_xmls, file_name+'.xml')
             out_file_ocr = os.path.join(self.dir_out, file_name+'.xml')
             img = cv2.imread(dir_img)
+
+            if self.draw_texts_on_image:
+                out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png')
+                image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white")
+                draw = ImageDraw.Draw(image_text)
+                total_bb_coordinates = []
 
             tree1 = ET.parse(dir_xml, parser = ET.XMLParser(encoding="utf-8"))
             root1=tree1.getroot()
@@ -5283,6 +5309,9 @@ class Eynollah_ocr:
 
                     x,y,w,h = cv2.boundingRect(textline_coords)
+
+                    if self.draw_texts_on_image:
+                        total_bb_coordinates.append([x,y,w,h])
 
                     h2w_ratio = h/float(w)
 
                     img_poly_on_img = np.copy(img)
@@ -5359,6 +5388,35 @@ class Eynollah_ocr:
             extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
             unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
+
+
+            if self.draw_texts_on_image:
+
+                font_path = "NotoSans-Regular.ttf" # Make sure this file exists!
+                font = ImageFont.truetype(font_path, 40) # default font; replaced per textline by fit_text_single_line below
+
+                for indexer_text, bb_ind in enumerate(total_bb_coordinates):
+
+
+                    x_bb = bb_ind[0]
+                    y_bb = bb_ind[1]
+                    w_bb = bb_ind[2]
+                    h_bb = bb_ind[3]
+
+                    font = self.fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4))
+
+                    ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2)
+
+                    text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font)
+                    text_width = text_bbox[2] - text_bbox[0]
+                    text_height = text_bbox[3] - text_bbox[1]
+
+                    text_x = x_bb + (w_bb - text_width) // 2 # Center horizontally
+                    text_y = y_bb + (h_bb - text_height) // 2 # Center vertically
+
+                    # Draw the text
+                    draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font)
+                image_text.save(out_image_with_text)
 
             text_by_textregion = []
             for ind in unique_cropped_lines_region_indexer:
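
With these changes, the rendered-text output can be driven from the CLI via -doit/-dtoi or directly from Python. A minimal usage sketch, assuming src/eynollah/eynollah.py is importable as eynollah.eynollah and that the directories below already exist (all paths are placeholders, not part of the patch):

    from eynollah.eynollah import Eynollah_ocr

    eynollah_ocr = Eynollah_ocr(
        dir_xmls="page_xmls/",                 # PAGE-XML inputs, as before
        dir_in="page_images/",
        dir_out="ocr_output/",
        dir_models="models/",
        dir_out_image_text="rendered_pages/",  # new: where images with the predicted text are written
        draw_texts_on_image=True,              # new: render each predicted textline onto a white page image
    )
    eynollah_ocr.run()

Note that fit_text_single_line loads "NotoSans-Regular.ttf" by a relative path, so the font file has to be resolvable from the working directory, as the in-code comment warns.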