Mirror of https://github.com/qurator-spk/eynollah.git (synced 2025-06-14 14:49:54 +02:00)
In OCR, the predicted text is now drawn on the image and the results are saved in a specified directory, making it easier to review the predicted output.
This commit is contained in:
parent 9b04688ebc
commit b1da0a3327

2 changed files with 75 additions and 3 deletions
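For context, the two new CLI options wire through to `Eynollah_ocr`. Below is a minimal sketch of driving the new feature from Python; the import path and all directory values are assumptions (only the keyword names visible in this diff are taken from the commit):

```python
# Hedged sketch: enable the new draw-texts-on-image feature from Python.
# The import path is an assumption; all paths below are placeholders.
from eynollah.eynollah import Eynollah_ocr

eynollah_ocr = Eynollah_ocr(
    dir_xmls="page_xmls/",             # PAGE-XML layout files
    dir_in="images/",                  # input document images
    dir_out="ocr_results/",            # OCR output directory
    dir_models="models/",              # directory with OCR models
    dir_out_image_text="image_text/",  # new: where rendered-text images are saved
    draw_texts_on_image=True,          # new: render predicted text onto a page image
)
eynollah_ocr.run()
```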
@@ -334,6 +334,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     help="directory of xmls",
     type=click.Path(exists=True, file_okay=False),
 )
+@click.option(
+    "--dir_out_image_text",
+    "-doit",
+    help="directory of images with predicted text",
+    type=click.Path(exists=True, file_okay=False),
+)
 @click.option(
     "--model",
     "-m",
@@ -359,6 +365,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     is_flag=True,
     help="if this parameter set to true, cropped textline images will not be masked with textline contour.",
 )
+@click.option(
+    "--draw_texts_on_image",
+    "-dtoi/-ndtoi",
+    is_flag=True,
+    help="if this parameter set to true, the predicted texts will be displayed on an image.",
+)
 @click.option(
     "--log_level",
     "-l",
@@ -366,18 +378,20 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     help="Override log level globally to this",
 )
 
-def ocr(dir_in, out, dir_xmls, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, log_level):
+def ocr(dir_in, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, log_level):
     if log_level:
         setOverrideLogLevel(log_level)
     initLogging()
     eynollah_ocr = Eynollah_ocr(
         dir_xmls=dir_xmls,
+        dir_out_image_text=dir_out_image_text,
         dir_in=dir_in,
         dir_out=out,
         dir_models=model,
         tr_ocr=tr_ocr,
         export_textline_images_and_text=export_textline_images_and_text,
         do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
+        draw_texts_on_image=draw_texts_on_image,
     )
     eynollah_ocr.run()
 
@@ -22,7 +22,7 @@ from ocrd_utils import getLogger
 import cv2
 import numpy as np
 from transformers import TrOCRProcessor
-from PIL import Image
+from PIL import Image, ImageDraw, ImageFont
 import torch
 from difflib import SequenceMatcher as sq
 from transformers import VisionEncoderDecoderModel
@@ -4409,7 +4409,6 @@ class Eynollah:
         text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light = \
             self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier)
         #print("text region early -2 in %.1fs", time.time() - t0)
-
         if num_col_classifier == 1 or num_col_classifier ==2:
             if num_col_classifier == 1:
                 img_w_new = 1000
@@ -4954,9 +4953,11 @@ class Eynollah_ocr:
         dir_xmls=None,
         dir_in=None,
         dir_out=None,
+        dir_out_image_text=None,
         tr_ocr=False,
         export_textline_images_and_text=False,
         do_not_mask_with_textline_contour=False,
+        draw_texts_on_image=False,
         logger=None,
     ):
         self.dir_in = dir_in
@@ -4966,6 +4967,8 @@ class Eynollah_ocr:
         self.tr_ocr = tr_ocr
         self.export_textline_images_and_text = export_textline_images_and_text
         self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
+        self.draw_texts_on_image = draw_texts_on_image
+        self.dir_out_image_text = dir_out_image_text
         if tr_ocr:
             self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
             self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -5083,6 +5086,23 @@ class Eynollah_ocr:
             return peaks_final
         else:
             return None
 
+    # Function to fit text inside the given area
+    def fit_text_single_line(self, draw, text, font_path, max_width, max_height):
+        initial_font_size = 50
+        font_size = initial_font_size
+        while font_size > 10:  # Minimum font size
+            font = ImageFont.truetype(font_path, font_size)
+            text_bbox = draw.textbbox((0, 0), text, font=font)  # Get text bounding box
+            text_width = text_bbox[2] - text_bbox[0]
+            text_height = text_bbox[3] - text_bbox[1]
+
+            if text_width <= max_width and text_height <= max_height:
+                return font  # Return the best-fitting font
+
+            font_size -= 2  # Reduce font size and retry
+
+        return ImageFont.truetype(font_path, 10)  # Smallest font fallback
+
     def return_textlines_split_if_needed(self, textline_image):
 
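The fitting helper added above works by measuring candidate font sizes with Pillow's `ImageDraw.textbbox` (available since Pillow 8.0) and shrinking until the text fits the target box. A standalone sketch of that measurement step, with a placeholder font path:

```python
# Standalone sketch of the measurement the fitter relies on.
# "NotoSans-Regular.ttf" is a placeholder; any TrueType font file works.
from PIL import Image, ImageDraw, ImageFont

canvas = Image.new("RGB", (400, 60), "white")
draw = ImageDraw.Draw(canvas)
font = ImageFont.truetype("NotoSans-Regular.ttf", 50)
left, top, right, bottom = draw.textbbox((0, 0), "example textline", font=font)
print(right - left, bottom - top)  # width/height compared against the target box
```

Stepping the size down by 2 trades a little precision for fewer measurements; a binary search over font sizes would reach the same answer in fewer iterations.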
@@ -5254,6 +5274,12 @@ class Eynollah_ocr:
             dir_xml = os.path.join(self.dir_xmls, file_name+'.xml')
             out_file_ocr = os.path.join(self.dir_out, file_name+'.xml')
             img = cv2.imread(dir_img)
 
+            if self.draw_texts_on_image:
+                out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png')
+                image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white")
+                draw = ImageDraw.Draw(image_text)
+                total_bb_coordinates = []
+
             tree1 = ET.parse(dir_xml, parser = ET.XMLParser(encoding="utf-8"))
             root1=tree1.getroot()
@@ -5283,6 +5309,9 @@ class Eynollah_ocr:
 
                 x,y,w,h = cv2.boundingRect(textline_coords)
+
+                if self.draw_texts_on_image:
+                    total_bb_coordinates.append([x,y,w,h])
 
                 h2w_ratio = h/float(w)
 
                 img_poly_on_img = np.copy(img)
@@ -5359,6 +5388,35 @@ class Eynollah_ocr:
 
             extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
             unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
 
+
+            if self.draw_texts_on_image:
+
+                font_path = "NotoSans-Regular.ttf"  # Make sure this file exists!
+                font = ImageFont.truetype(font_path, 40)
+
+                for indexer_text, bb_ind in enumerate(total_bb_coordinates):
+
+
+                    x_bb = bb_ind[0]
+                    y_bb = bb_ind[1]
+                    w_bb = bb_ind[2]
+                    h_bb = bb_ind[3]
+
+                    font = self.fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) )
+
+                    ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2)
+
+                    text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font)
+                    text_width = text_bbox[2] - text_bbox[0]
+                    text_height = text_bbox[3] - text_bbox[1]
+
+                    text_x = x_bb + (w_bb - text_width) // 2  # Center horizontally
+                    text_y = y_bb + (h_bb - text_height) // 2  # Center vertically
+
+                    # Draw the text
+                    draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font)
+                image_text.save(out_image_with_text)
+
             text_by_textregion = []
             for ind in unique_cropped_lines_region_indexer:
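The drawing loop centers each predicted line inside its layout box; with hypothetical numbers, the arithmetic looks like this:

```python
# Worked example of the centering above; all values are hypothetical.
x_bb, w_bb = 100, 400   # box origin and width from cv2.boundingRect
text_width = 200        # measured with draw.textbbox for the fitted font
text_x = x_bb + (w_bb - text_width) // 2
print(text_x)           # 200 -> the text starts centered within the box
```

Capping the fitted height at `int(h_bb*0.4)` presumably leaves vertical padding so ascenders and descenders do not collide with neighbouring boxes.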