Mirror of https://github.com/qurator-spk/eynollah.git (synced 2025-07-12 04:19:55 +02:00)
OCR prediction can now integrate results from both RGB and binarized images, or be performed on each individually
This commit is contained in: parent b1da0a3327, commit 4de441eaaa
2 changed files with 70 additions and 8 deletions
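
In short: when --prediction_with_both_of_rgb_and_bin is enabled, every text line is predicted twice, once from the RGB crop and once from the binarized crop, and the two softmax outputs are averaged before CTC decoding. A minimal sketch of that fusion step (plain Python, not eynollah's actual API; prediction_model stands for the loaded Keras OCR model):

def predict_fused(prediction_model, lines_rgb, lines_bin):
    # lines_rgb / lines_bin: arrays of shape (batch, height, width, 3) holding the
    # same text lines cropped from the RGB image and from its binarized counterpart
    preds_rgb = prediction_model.predict(lines_rgb, verbose=0)
    preds_bin = prediction_model.predict(lines_bin, verbose=0)
    # average the per-character probability distributions before decoding,
    # as introduced by this commit
    return (preds_rgb + preds_bin) / 2.0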
@@ -321,6 +321,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     help="directory of images",
     type=click.Path(exists=True, file_okay=False),
 )
+@click.option(
+    "--dir_in_bin",
+    "-dib",
+    help="directory of binarized images. Provide this to run prediction on both the RGB and the binarized images. All binarized images must be PNG files.",
+    type=click.Path(exists=True, file_okay=False),
+)
 @click.option(
     "--out",
     "-o",
@@ -371,6 +377,12 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     is_flag=True,
     help="if this parameter set to true, the predicted texts will be displayed on an image.",
 )
+@click.option(
+    "--prediction_with_both_of_rgb_and_bin",
+    "-brb/-nbrb",
+    is_flag=True,
+    help="If this parameter is set to True, the prediction will be performed using both RGB and binary images. However, this does not necessarily improve results; it may be beneficial for certain document images.",
+)
 @click.option(
     "--log_level",
     "-l",
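
For reference, a standalone sketch (a hypothetical demo command, not eynollah's CLI) of how the two new click options behave: -dib takes an existing directory, while -brb and -nbrb switch the boolean flag on and off, mirroring the declarations added above:

import click

@click.command()
@click.option("--dir_in_bin", "-dib",
              type=click.Path(exists=True, file_okay=False),
              help="directory of binarized images")
@click.option("--prediction_with_both_of_rgb_and_bin", "-brb/-nbrb", is_flag=True,
              help="predict on both RGB and binarized images")
def demo(dir_in_bin, prediction_with_both_of_rgb_and_bin):
    # -brb sets the flag to True, -nbrb sets it back to False (the default)
    click.echo(f"dir_in_bin={dir_in_bin}, use_both={prediction_with_both_of_rgb_and_bin}")

if __name__ == "__main__":
    demo()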
@@ -378,7 +390,7 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     help="Override log level globally to this",
 )

-def ocr(dir_in, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, log_level):
+def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, log_level):
     if log_level:
         setOverrideLogLevel(log_level)
     initLogging()
@@ -386,12 +398,14 @@ def ocr(dir_in, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textlin
         dir_xmls=dir_xmls,
         dir_out_image_text=dir_out_image_text,
         dir_in=dir_in,
+        dir_in_bin=dir_in_bin,
         dir_out=out,
         dir_models=model,
         tr_ocr=tr_ocr,
         export_textline_images_and_text=export_textline_images_and_text,
         do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
         draw_texts_on_image=draw_texts_on_image,
+        prediction_with_both_of_rgb_and_bin=prediction_with_both_of_rgb_and_bin,
     )
     eynollah_ocr.run()

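
A hedged usage sketch of the extended class from Python, with keyword arguments matching the call above (directory names are placeholders):

from eynollah.eynollah import Eynollah_ocr  # import path may differ between releases

eynollah_ocr = Eynollah_ocr(
    dir_models="models_eynollah",        # placeholder paths
    dir_xmls="page_xmls",
    dir_in="images_rgb",
    dir_in_bin="images_binarized",
    dir_out="ocr_output",
    prediction_with_both_of_rgb_and_bin=True,
)
eynollah_ocr.run()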
@@ -4952,15 +4952,18 @@ class Eynollah_ocr:
         dir_models,
         dir_xmls=None,
         dir_in=None,
+        dir_in_bin=None,
         dir_out=None,
         dir_out_image_text=None,
         tr_ocr=False,
         export_textline_images_and_text=False,
         do_not_mask_with_textline_contour=False,
         draw_texts_on_image=False,
+        prediction_with_both_of_rgb_and_bin=False,
         logger=None,
     ):
         self.dir_in = dir_in
+        self.dir_in_bin = dir_in_bin
         self.dir_out = dir_out
         self.dir_xmls = dir_xmls
         self.dir_models = dir_models
@@ -4969,6 +4972,7 @@ class Eynollah_ocr:
         self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
         self.draw_texts_on_image = draw_texts_on_image
         self.dir_out_image_text = dir_out_image_text
+        self.prediction_with_both_of_rgb_and_bin = prediction_with_both_of_rgb_and_bin
         if tr_ocr:
             self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
             self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -4977,7 +4981,7 @@ class Eynollah_ocr:
             self.model_ocr.to(self.device)

         else:
-            self.model_ocr_dir = dir_models + "/model_step_150000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn"
+            self.model_ocr_dir = dir_models + "/model_step_50000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn"
             model_ocr = load_model(self.model_ocr_dir , compile=False)

             self.prediction_model = tf.keras.models.Model(
@@ -5104,15 +5108,20 @@ class Eynollah_ocr:

         return ImageFont.truetype(font_path, 10) # Smallest font fallback

-    def return_textlines_split_if_needed(self, textline_image):
+    def return_textlines_split_if_needed(self, textline_image, textline_image_bin):

         split_point = self.return_start_and_end_of_common_text_of_textline_ocr_without_common_section(textline_image)
         if split_point:
             image1 = textline_image[:, :split_point,:]# image.crop((0, 0, width2, height))
             image2 = textline_image[:, split_point:,:]#image.crop((width1, 0, width, height))
-            return [image1, image2]
+            if self.prediction_with_both_of_rgb_and_bin:
+                image1_bin = textline_image_bin[:, :split_point,:]# image.crop((0, 0, width2, height))
+                image2_bin = textline_image_bin[:, split_point:,:]#image.crop((width1, 0, width, height))
+                return [image1, image2], [image1_bin, image2_bin]
+            else:
+                return [image1, image2], None
         else:
-            return None
+            return None, None
     def preprocess_and_resize_image_for_ocrcnn_model(self, img, image_height, image_width):
         ratio = image_height /float(img.shape[0])
         w_ratio = int(ratio * img.shape[1])
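
The helper now always returns a pair. A standalone sketch of the same idea (hypothetical function and variable names): both crops are cut at the same column so the RGB and binarized halves of a long line stay aligned:

def split_line_pair(textline_rgb, textline_bin, split_point):
    # split_point: column index where the line should be cut, or None/0 to keep it whole
    if not split_point:
        return None, None
    rgb_halves = [textline_rgb[:, :split_point, :], textline_rgb[:, split_point:, :]]
    if textline_bin is None:
        return rgb_halves, None
    bin_halves = [textline_bin[:, :split_point, :], textline_bin[:, split_point:, :]]
    return rgb_halves, bin_halves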
@@ -5123,7 +5132,7 @@ class Eynollah_ocr:

         img = resize_image(img, image_height, width_new)
         img_fin = np.ones((image_height, image_width, 3))*255
-        img_fin[:,:width_new,:] = img[:,:,:]
+        img_fin[:,:+width_new,:] = img[:,:,:]
         img_fin = img_fin / 255.
         return img_fin

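
Two notes on this hunk: the unary plus in ":+width_new" is a no-op, so the slice is equivalent to ":width_new"; and the helper itself scales a line crop to the model's input height and right-pads it to a fixed width with white. A rough standalone sketch of that behaviour (cv2.resize stands in for eynollah's resize_image, and the width-capping detail is an assumption, since it is not shown in this hunk):

import cv2
import numpy as np

def preprocess_line(img, image_height, image_width):
    # scale the crop to the target height, keeping the aspect ratio
    ratio = image_height / float(img.shape[0])
    width_new = min(image_width, max(1, int(ratio * img.shape[1])))  # assumed cap
    img = cv2.resize(img, (width_new, image_height))
    # pad to the fixed model input width with white pixels, then normalise to [0, 1]
    img_fin = np.ones((image_height, image_width, 3)) * 255
    img_fin[:, :width_new, :] = img[:, :, :3]
    return img_fin / 255.0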
@@ -5183,7 +5192,7 @@ class Eynollah_ocr:
                        cropped_lines.append(img_crop)
                        cropped_lines_meging_indexing.append(0)
                    else:
-                        splited_images = self.return_textlines_split_if_needed(img_crop)
+                        splited_images, _ = self.return_textlines_split_if_needed(img_crop, None)
                        #print(splited_images)
                        if splited_images:
                            cropped_lines.append(splited_images[0])
@@ -5274,6 +5283,10 @@ class Eynollah_ocr:
                dir_xml = os.path.join(self.dir_xmls, file_name+'.xml')
                out_file_ocr = os.path.join(self.dir_out, file_name+'.xml')
                img = cv2.imread(dir_img)
+                if self.prediction_with_both_of_rgb_and_bin:
+                    cropped_lines_bin = []
+                    dir_img_bin = os.path.join(self.dir_in_bin, file_name+'.png')
+                    img_bin = cv2.imread(dir_img_bin)

                if self.draw_texts_on_image:
                    out_image_with_text = os.path.join(self.dir_out_image_text, file_name+'.png')
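
The binarized counterpart is looked up purely by naming convention: same file stem as the RGB image, always with a .png extension. A small hedged sketch (hypothetical helper; eynollah itself does the join inline as above) that also guards against a missing file, since cv2.imread silently returns None:

import os
import cv2

def read_binarized(dir_in_bin, file_name):
    # file_name is the image stem without extension, e.g. "page_0001"
    path = os.path.join(dir_in_bin, file_name + ".png")
    img_bin = cv2.imread(path)
    if img_bin is None:
        raise FileNotFoundError(f"expected a binarized PNG at {path}")
    return img_bin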
@@ -5315,6 +5328,10 @@ class Eynollah_ocr:
                        h2w_ratio = h/float(w)

                        img_poly_on_img = np.copy(img)
+                        if self.prediction_with_both_of_rgb_and_bin:
+                            img_poly_on_img_bin = np.copy(img_bin)
+                            img_crop_bin = img_poly_on_img_bin[y:y+h, x:x+w, :]
+
                        mask_poly = np.zeros(img.shape)
                        mask_poly = cv2.fillPoly(mask_poly, pts=[textline_coords], color=(1, 1, 1))

@@ -5322,14 +5339,22 @@ class Eynollah_ocr:
                        img_crop = img_poly_on_img[y:y+h, x:x+w, :]
                        if not self.do_not_mask_with_textline_contour:
                            img_crop[mask_poly==0] = 255
+                            if self.prediction_with_both_of_rgb_and_bin:
+                                img_crop_bin[mask_poly==0] = 255

                        if not self.export_textline_images_and_text:
                            if h2w_ratio > 0.1:
                                img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width)
                                cropped_lines.append(img_fin)
                                cropped_lines_meging_indexing.append(0)
+                                if self.prediction_with_both_of_rgb_and_bin:
+                                    img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width)
+                                    cropped_lines_bin.append(img_fin)
                            else:
-                                splited_images = self.return_textlines_split_if_needed(img_crop)
+                                if self.prediction_with_both_of_rgb_and_bin:
+                                    splited_images, splited_images_bin = self.return_textlines_split_if_needed(img_crop, img_crop_bin)
+                                else:
+                                    splited_images, splited_images_bin = self.return_textlines_split_if_needed(img_crop, None)
                                if splited_images:
                                    img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images[0], image_height, image_width)
                                    cropped_lines.append(img_fin)
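
The key point in this hunk is that one and the same text-line polygon mask whitens the background of both crops, so the RGB and binarized inputs stay pixel-aligned. A minimal sketch (hypothetical helper; the mask is assumed to be already cropped to the line's bounding box):

def whiten_outside_textline(img_crop, img_crop_bin, mask_poly_crop):
    # mask_poly_crop: same spatial size as the crops, 1 inside the text-line polygon, 0 outside
    img_crop[mask_poly_crop == 0] = 255
    if img_crop_bin is not None:
        img_crop_bin[mask_poly_crop == 0] = 255
    return img_crop, img_crop_bin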
@@ -5338,10 +5363,21 @@ class Eynollah_ocr:

                                    cropped_lines.append(img_fin)
                                    cropped_lines_meging_indexing.append(-1)
+
+                                    if self.prediction_with_both_of_rgb_and_bin:
+                                        img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[0], image_height, image_width)
+                                        cropped_lines_bin.append(img_fin)
+                                        img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(splited_images_bin[1], image_height, image_width)
+                                        cropped_lines_bin.append(img_fin)
+
                                else:
                                    img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop, image_height, image_width)
                                    cropped_lines.append(img_fin)
                                    cropped_lines_meging_indexing.append(0)

+                                    if self.prediction_with_both_of_rgb_and_bin:
+                                        img_fin = self.preprocess_and_resize_image_for_ocrcnn_model(img_crop_bin, image_height, image_width)
+                                        cropped_lines_bin.append(img_fin)
+
                        if self.export_textline_images_and_text:
                            if child_textlines.tag.endswith("TextEquiv"):
@@ -5370,14 +5406,26 @@ class Eynollah_ocr:
                        imgs = cropped_lines[n_start:]
                        imgs = np.array(imgs)
                        imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3)
+                        if self.prediction_with_both_of_rgb_and_bin:
+                            imgs_bin = cropped_lines_bin[n_start:]
+                            imgs_bin = np.array(imgs_bin)
+                            imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3)
                    else:
                        n_start = i*b_s
                        n_end = (i+1)*b_s
                        imgs = cropped_lines[n_start:n_end]
                        imgs = np.array(imgs).reshape(b_s, image_height, image_width, 3)

+                        if self.prediction_with_both_of_rgb_and_bin:
+                            imgs_bin = cropped_lines_bin[n_start:n_end]
+                            imgs_bin = np.array(imgs_bin).reshape(b_s, image_height, image_width, 3)
+

                    preds = self.prediction_model.predict(imgs, verbose=0)
+                    if self.prediction_with_both_of_rgb_and_bin:
+                        preds_bin = self.prediction_model.predict(imgs_bin, verbose=0)
+                        preds = (preds + preds_bin) / 2.
+
                    pred_texts = self.decode_batch_predictions(preds)

                    for ib in range(imgs.shape[0]):
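
The batching here relies on cropped_lines and cropped_lines_bin being appended in lockstep: slicing both lists with the same n_start:n_end keeps the RGB and binarized crops of each text line at the same batch position, which is what makes the element-wise averaging of preds and preds_bin meaningful. A compact sketch of that pairing (hypothetical helper; shapes follow the code above):

import numpy as np

def batch_pair(cropped_lines, cropped_lines_bin, n_start, n_end, image_height, image_width):
    # both lists were filled in lockstep, so the same slice addresses the same text lines
    imgs = np.array(cropped_lines[n_start:n_end]).reshape(-1, image_height, image_width, 3)
    imgs_bin = None
    if cropped_lines_bin is not None:
        imgs_bin = np.array(cropped_lines_bin[n_start:n_end]).reshape(-1, image_height, image_width, 3)
    return imgs, imgs_bin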