Mirror of https://github.com/qurator-spk/eynollah.git (synced 2025-06-14 14:49:54 +02:00)
In OCR, the predicted text is now drawn onto the image and the results are saved in a specified directory, making it easier to review the predicted output.

This commit is contained in:
parent 9b04688ebc
commit b1da0a3327

2 changed files with 75 additions and 3 deletions
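
For orientation, here is a minimal usage sketch of the updated API, based on the constructor call in this diff. The directory names are illustrative placeholders, and the import path is an assumption (it may differ by eynollah version):

    # Hypothetical import path and directories; the kwargs match this diff.
    from eynollah.eynollah import Eynollah_ocr

    eynollah_ocr = Eynollah_ocr(
        dir_xmls="page_xml/",               # PAGE-XML layout files
        dir_in="images/",                   # input page images
        dir_out="ocr_output/",              # OCR result XMLs
        dir_models="models/",
        dir_out_image_text="text_images/",  # new: rendered-text images go here
        draw_texts_on_image=True,           # new: enable drawing predicted text
    )
    eynollah_ocr.run()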
@@ -334,6 +334,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     help="directory of xmls",
     type=click.Path(exists=True, file_okay=False),
 )
+@click.option(
+    "--dir_out_image_text",
+    "-doit",
+    help="directory of images with predicted text",
+    type=click.Path(exists=True, file_okay=False),
+)
 @click.option(
     "--model",
     "-m",
@@ -359,6 +365,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     is_flag=True,
     help="if this parameter set to true, cropped textline images will not be masked with textline contour.",
 )
+@click.option(
+    "--draw_texts_on_image",
+    "-dtoi/-ndtoi",
+    is_flag=True,
+    help="if this parameter set to true, the predicted texts will be displayed on an image.",
+)
 @click.option(
     "--log_level",
     "-l",
@@ -366,18 +378,20 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     help="Override log level globally to this",
 )
 
-def ocr(dir_in, out, dir_xmls, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, log_level):
+def ocr(dir_in, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, log_level):
     if log_level:
         setOverrideLogLevel(log_level)
     initLogging()
     eynollah_ocr = Eynollah_ocr(
         dir_xmls=dir_xmls,
+        dir_out_image_text=dir_out_image_text,
         dir_in=dir_in,
         dir_out=out,
         dir_models=model,
         tr_ocr=tr_ocr,
         export_textline_images_and_text=export_textline_images_and_text,
         do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
+        draw_texts_on_image=draw_texts_on_image,
     )
     eynollah_ocr.run()
 
@@ -22,7 +22,7 @@ from ocrd_utils import getLogger
 import cv2
 import numpy as np
 from transformers import TrOCRProcessor
-from PIL import Image
+from PIL import Image, ImageDraw, ImageFont
 import torch
 from difflib import SequenceMatcher as sq
 from transformers import VisionEncoderDecoderModel
@@ -4409,7 +4409,6 @@ class Eynollah:
             text_regions_p_1 ,erosion_hurts, polygons_lines_xml, textline_mask_tot_ea, img_bin_light = \
                 self.get_regions_light_v(img_res, is_image_enhanced, num_col_classifier)
             #print("text region early -2 in %.1fs", time.time() - t0)
-
             if num_col_classifier == 1 or num_col_classifier ==2:
                 if num_col_classifier == 1:
                     img_w_new = 1000
@@ -4954,9 +4953,11 @@ class Eynollah_ocr:
         dir_xmls=None,
         dir_in=None,
         dir_out=None,
+        dir_out_image_text=None,
         tr_ocr=False,
         export_textline_images_and_text=False,
         do_not_mask_with_textline_contour=False,
+        draw_texts_on_image=False,
         logger=None,
     ):
         self.dir_in = dir_in
@@ -4966,6 +4967,8 @@ class Eynollah_ocr:
         self.tr_ocr = tr_ocr
         self.export_textline_images_and_text = export_textline_images_and_text
         self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
+        self.draw_texts_on_image = draw_texts_on_image
+        self.dir_out_image_text = dir_out_image_text
         if tr_ocr:
             self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
             self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -5084,6 +5087,23 @@ class Eynollah_ocr:
         else:
             return None
 
+    # Function to fit text inside the given area
+    def fit_text_single_line(self, draw, text, font_path, max_width, max_height):
+        initial_font_size = 50
+        font_size = initial_font_size
+        while font_size > 10:  # Minimum font size
+            font = ImageFont.truetype(font_path, font_size)
+            text_bbox = draw.textbbox((0, 0), text, font=font)  # Get text bounding box
+            text_width = text_bbox[2] - text_bbox[0]
+            text_height = text_bbox[3] - text_bbox[1]
+
+            if text_width <= max_width and text_height <= max_height:
+                return font  # Return the best-fitting font
+
+            font_size -= 2  # Reduce font size and retry
+
+        return ImageFont.truetype(font_path, 10)  # Smallest font fallback
+
     def return_textlines_split_if_needed(self, textline_image):
 
         split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image)
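
The new fit_text_single_line helper does a linear search downward from 50 pt in 2 pt steps until the rendered text fits the target box, falling back to 10 pt. A standalone sketch of the same technique, assuming a TrueType font file is available at the path used in this commit:

    from PIL import Image, ImageDraw, ImageFont

    canvas = Image.new("RGB", (400, 100), "white")
    draw = ImageDraw.Draw(canvas)
    text, max_w, max_h = "Sample line", 380, 80

    size, font = 50, None
    while size > 10:
        candidate = ImageFont.truetype("NotoSans-Regular.ttf", size)  # assumed font file
        bbox = draw.textbbox((0, 0), text, font=candidate)  # (left, top, right, bottom)
        if bbox[2] - bbox[0] <= max_w and bbox[3] - bbox[1] <= max_h:
            font = candidate  # first size that fits wins
            break
        size -= 2
    if font is None:
        font = ImageFont.truetype("NotoSans-Regular.ttf", 10)  # smallest-size fallback

    draw.text((10, 10), text, fill="black", font=font)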
@@ -5255,6 +5275,12 @@ class Eynollah_ocr:
             out_file_ocr = os.path.join(self.dir_out, file_name+'.xml')
             img = cv2.imread(dir_img)
 
+            if self.draw_texts_on_image:
+                out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png')
+                image_text = Image.new("RGB", (img.shape[1], img.shape[0]), "white")
+                draw = ImageDraw.Draw(image_text)
+                total_bb_coordinates = []
+
             tree1 = ET.parse(dir_xml, parser = ET.XMLParser(encoding="utf-8"))
             root1=tree1.getroot()
             alltags=[elem.tag for elem in root1.iter()]
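
Note the size tuple in the hunk above: OpenCV arrays are (height, width, channels), while PIL's Image.new expects (width, height), hence the (img.shape[1], img.shape[0]) swap when creating the white canvas.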
@@ -5283,6 +5309,9 @@ class Eynollah_ocr:
 
                     x,y,w,h = cv2.boundingRect(textline_coords)
 
+                    if self.draw_texts_on_image:
+                        total_bb_coordinates.append([x,y,w,h])
+
                     h2w_ratio = h/float(w)
 
                     img_poly_on_img = np.copy(img)
@@ -5360,6 +5389,35 @@ class Eynollah_ocr:
             extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
             unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
 
+
+            if self.draw_texts_on_image:
+
+                font_path = "NotoSans-Regular.ttf"  # Make sure this file exists!
+                font = ImageFont.truetype(font_path, 40)
+
+                for indexer_text, bb_ind in enumerate(total_bb_coordinates):
+
+
+                    x_bb = bb_ind[0]
+                    y_bb = bb_ind[1]
+                    w_bb = bb_ind[2]
+                    h_bb = bb_ind[3]
+
+                    font = self.fit_text_single_line(draw, extracted_texts_merged[indexer_text], font_path, w_bb, int(h_bb*0.4) )
+
+                    ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2)
+
+                    text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font)
+                    text_width = text_bbox[2] - text_bbox[0]
+                    text_height = text_bbox[3] - text_bbox[1]
+
+                    text_x = x_bb + (w_bb - text_width) // 2  # Center horizontally
+                    text_y = y_bb + (h_bb - text_height) // 2  # Center vertically
+
+                    # Draw the text
+                    draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font)
+                image_text.save(out_image_with_text)
+
             text_by_textregion = []
             for ind in unique_cropped_lines_region_indexer:
                 extracted_texts_merged_un = np.array(extracted_texts_merged)[np.array(cropped_lines_region_indexer)==ind]
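
As a quick check of the centering arithmetic above: for a box at x_bb = 100 with w_bb = 200 and a rendered text_width of 150, text_x = 100 + (200 - 150) // 2 = 125, insetting the line 25 px on each side; vertical centering works the same way with h_bb and text_height.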