diff --git a/src/eynollah/cli/cli_ocr.py b/src/eynollah/cli/cli_ocr.py
index 962ee9b..9bb8620 100644
--- a/src/eynollah/cli/cli_ocr.py
+++ b/src/eynollah/cli/cli_ocr.py
@@ -59,12 +59,6 @@ import click
     is_flag=True,
     help="if this parameter set to true, transformer ocr will be applied, otherwise cnn_rnn model.",
 )
-@click.option(
-    "--export_textline_images_and_text",
-    "-etit/-noetit",
-    is_flag=True,
-    help="if this parameter set to true, images and text in xml will be exported into output dir. This files can be used for training a OCR engine.",
-)
 @click.option(
     "--do_not_mask_with_textline_contour",
     "-nmtc/-mtc",
@@ -76,11 +70,6 @@ import click
     "-bs",
     help="number of inference batch size. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively",
 )
-@click.option(
-    "--dataset_abbrevation",
-    "-ds_pref",
-    help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset",
-)
 @click.option(
     "--min_conf_value_of_textline_text",
     "-min_conf",
@@ -97,7 +86,5 @@ def ocr_cli(
         dir_out_image_text,
         overwrite,
         tr_ocr,
-        export_textline_images_and_text,
         do_not_mask_with_textline_contour,
         batch_size,
-        dataset_abbrevation,
@@ -106,18 +93,10 @@ def ocr_cli(
     """
     Recognize text with a CNN/RNN or transformer ML model.
     """
-    assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text -etit can not be set alongside transformer ocr -tr_ocr"
-    # FIXME: refactor: move export_textline_images_and_text out of eynollah.py
-    # assert not export_textline_images_and_text or not model, "Exporting textline and text -etit can not be set alongside model -m"
-    assert not export_textline_images_and_text or not batch_size, "Exporting textline and text -etit can not be set alongside batch size -bs"
-    assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text -etit can not be set alongside directory of bin images -dib"
-    assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text -etit can not be set alongside directory of images with predicted text -doit"
-    assert bool(image) != bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both."
+    assert bool(image) ^ bool(dir_in), "Either -i (single image) or -di (directory) must be provided, but not both."
 
     from ..eynollah_ocr import Eynollah_ocr
     eynollah_ocr = Eynollah_ocr(
         model_zoo=ctx.obj.model_zoo,
         tr_ocr=tr_ocr,
-        export_textline_images_and_text=export_textline_images_and_text,
         do_not_mask_with_textline_contour=do_not_mask_with_textline_contour,
         batch_size=batch_size,
-        pref_of_dataset=dataset_abbrevation,
diff --git a/src/eynollah/eynollah_ocr.py b/src/eynollah/eynollah_ocr.py
index 61de12c..52dcca9 100644
--- a/src/eynollah/eynollah_ocr.py
+++ b/src/eynollah/eynollah_ocr.py
@@ -9,17 +9,13 @@ from logging import Logger, getLogger
 from typing import Optional
 from pathlib import Path
 import os
-import json
 import gc
 import sys
 import math
 import time
-from keras.layers import StringLookup
 import cv2
 import xml.etree.ElementTree as ET
-import tensorflow as tf
-from keras.models import load_model
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
 from eynollah.model_zoo import EynollahModelZoo
@@ -48,11 +44,6 @@ if sys.version_info < (3, 10):
 else:
     import importlib.resources as importlib_resources
 
-try:
-    from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-except ImportError:
-    TrOCRProcessor = VisionEncoderDecoderModel = None
-
 class Eynollah_ocr:
     def __init__(
         self,
@@ -60,27 +51,16 @@ class Eynollah_ocr:
         model_zoo: EynollahModelZoo,
         tr_ocr=False,
         batch_size: Optional[int]=None,
-        export_textline_images_and_text: bool=False,
         do_not_mask_with_textline_contour: bool=False,
-        pref_of_dataset=None,
         min_conf_value_of_textline_text : Optional[float]=None,
         logger: Optional[Logger]=None,
     ):
         self.tr_ocr = tr_ocr
-        # For generating textline-image pairs for traning, move to generate_gt_for_training
-        self.export_textline_images_and_text = export_textline_images_and_text
         # masking for OCR and GT generation, relevant for skewed lines and bounding boxes
         self.do_not_mask_with_textline_contour = do_not_mask_with_textline_contour
-        # prefix or dataset
-        self.pref_of_dataset = pref_of_dataset
         self.logger = logger if logger else getLogger('eynollah.ocr')
         self.model_zoo = model_zoo
 
-        # TODO: Properly document what 'export_textline_images_and_text' is about
-        if export_textline_images_and_text:
-            self.logger.info("export_textline_images_and_text was set, so no actual models are loaded")
-            return
-
         self.min_conf_value_of_textline_text = min_conf_value_of_textline_text if min_conf_value_of_textline_text else 0.3
         self.b_s = 2 if batch_size is None and tr_ocr else 8 if batch_size is None else batch_size
@@ -539,40 +519,55 @@ class Eynollah_ocr:
                     mask_poly = mask_poly[y:y+h, x:x+w, :]
                     img_crop = img_poly_on_img[y:y+h, x:x+w, :]
-                    if self.export_textline_images_and_text:
+                    # print(file_name, angle_degrees, w*h,
+                    #       mask_poly[:,:,0].sum(),
+                    #       mask_poly[:,:,0].sum() /float(w*h) ,
+                    #       'didi')
+
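+                    # For clearly skewed lines (> 3 degrees), deskew the crop before
+                    # OCR: estimate the slope from the contour's image moments,
+                    # rotate image and mask with padding, then re-crop to the
+                    # rotated mask's bounding box.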
+                    if angle_degrees > 3:
+                        better_des_slope = get_orientation_moments(textline_coords)
+
+                        img_crop = rotate_image_with_padding(img_crop, better_des_slope)
+                        if dir_in_bin is not None:
+                            img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope)
+
+                        mask_poly = rotate_image_with_padding(mask_poly, better_des_slope)
+                        mask_poly = mask_poly.astype('uint8')
+
+                        #new bounding box
+                        x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0])
+
+                        mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :]
+                        img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :]
+
                         if not self.do_not_mask_with_textline_contour:
                             img_crop[mask_poly==0] = 255
-
-                    else:
-                        # print(file_name, angle_degrees, w*h,
-                        #       mask_poly[:,:,0].sum(),
-                        #       mask_poly[:,:,0].sum() /float(w*h) ,
-                        #       'didi')
-
-                        if angle_degrees > 3:
-                            better_des_slope = get_orientation_moments(textline_coords)
-
-                            img_crop = rotate_image_with_padding(img_crop, better_des_slope)
-                            if dir_in_bin is not None:
-                                img_crop_bin = rotate_image_with_padding(img_crop_bin, better_des_slope)
-
-                            mask_poly = rotate_image_with_padding(mask_poly, better_des_slope)
-                            mask_poly = mask_poly.astype('uint8')
-
-                            #new bounding box
-                            x_n, y_n, w_n, h_n = get_contours_and_bounding_boxes(mask_poly[:,:,0])
-
-                            mask_poly = mask_poly[y_n:y_n+h_n, x_n:x_n+w_n, :]
-                            img_crop = img_crop[y_n:y_n+h_n, x_n:x_n+w_n, :]
-
+                        if dir_in_bin is not None:
+                            img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :]
                             if not self.do_not_mask_with_textline_contour:
-                                img_crop[mask_poly==0] = 255
+                                img_crop_bin[mask_poly==0] = 255
+
+                        if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90:
                             if dir_in_bin is not None:
-                                img_crop_bin = img_crop_bin[y_n:y_n+h_n, x_n:x_n+w_n, :]
-                                if not self.do_not_mask_with_textline_contour:
-                                    img_crop_bin[mask_poly==0] = 255
-
-                            if mask_poly[:,:,0].sum() /float(w_n*h_n) < 0.50 and w_scaled > 90:
+                                img_crop, img_crop_bin = \
+                                    break_curved_line_into_small_pieces_and_then_merge(
+                                        img_crop, mask_poly, img_crop_bin)
+                            else:
+                                img_crop, _ = \
+                                    break_curved_line_into_small_pieces_and_then_merge(
+                                        img_crop, mask_poly)
+
+                    else:
+                        better_des_slope = 0
+                        if not self.do_not_mask_with_textline_contour:
+                            img_crop[mask_poly==0] = 255
+                        if dir_in_bin is not None:
+                            if not self.do_not_mask_with_textline_contour:
+                                img_crop_bin[mask_poly==0] = 255
+                        if type_textregion=='drop-capital':
+                            pass
+                        else:
+                            if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90:
                                 if dir_in_bin is not None:
                                     img_crop, img_crop_bin = \
                                         break_curved_line_into_small_pieces_and_then_merge(
@@ -581,188 +576,178 @@ class Eynollah_ocr:
                                             img_crop, mask_poly, img_crop_bin)
                                 else:
                                     img_crop, _ = \
                                         break_curved_line_into_small_pieces_and_then_merge(
                                             img_crop, mask_poly)
-
-                        else:
-                            better_des_slope = 0
-                            if not self.do_not_mask_with_textline_contour:
-                                img_crop[mask_poly==0] = 255
-                            if dir_in_bin is not None:
-                                if not self.do_not_mask_with_textline_contour:
-                                    img_crop_bin[mask_poly==0] = 255
-                            if type_textregion=='drop-capital':
-                                pass
-                            else:
-                                if mask_poly[:,:,0].sum() /float(w*h) < 0.50 and w_scaled > 90:
-                                    if dir_in_bin is not None:
-                                        img_crop, img_crop_bin = \
-                                            break_curved_line_into_small_pieces_and_then_merge(
-                                                img_crop, mask_poly, img_crop_bin)
-                                    else:
-                                        img_crop, _ = \
-                                            break_curved_line_into_small_pieces_and_then_merge(
-                                                img_crop, mask_poly)
-                    if not self.export_textline_images_and_text:
-                        if w_scaled < 750:#1.5*image_width:
-                            img_fin = preprocess_and_resize_image_for_ocrcnn_model(
-                                img_crop, image_height, image_width)
-                            cropped_lines.append(img_fin)
-                            if abs(better_des_slope) > 45:
-                                cropped_lines_ver_index.append(1)
-                            else:
-                                cropped_lines_ver_index.append(0)
-
-                            cropped_lines_meging_indexing.append(0)
-                            if dir_in_bin is not None:
-                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
-                                    img_crop_bin, image_height, image_width)
-                                cropped_lines_bin.append(img_fin)
-                        else:
-                            splited_images, splited_images_bin = return_textlines_split_if_needed(
-                                img_crop, img_crop_bin if dir_in_bin is not None else None)
-                            if splited_images:
-                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
-                                    splited_images[0], image_height, image_width)
-                                cropped_lines.append(img_fin)
-                                cropped_lines_meging_indexing.append(1)
-
-                                if abs(better_des_slope) > 45:
-                                    cropped_lines_ver_index.append(1)
-                                else:
-                                    cropped_lines_ver_index.append(0)
-
-                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
-                                    splited_images[1], image_height, image_width)
-
-                                cropped_lines.append(img_fin)
-                                cropped_lines_meging_indexing.append(-1)
-
-                                if abs(better_des_slope) > 45:
-                                    cropped_lines_ver_index.append(1)
-                                else:
-                                    cropped_lines_ver_index.append(0)
-
-                                if dir_in_bin is not None:
-                                    img_fin = preprocess_and_resize_image_for_ocrcnn_model(
-                                        splited_images_bin[0], image_height, image_width)
-                                    cropped_lines_bin.append(img_fin)
-                                    img_fin = preprocess_and_resize_image_for_ocrcnn_model(
-                                        splited_images_bin[1], image_height, image_width)
-                                    cropped_lines_bin.append(img_fin)
-
-                            else:
-                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
-                                    img_crop, image_height, image_width)
-                                cropped_lines.append(img_fin)
-                                cropped_lines_meging_indexing.append(0)
-
-                                if abs(better_des_slope) > 45:
-                                    cropped_lines_ver_index.append(1)
-                                else:
-                                    cropped_lines_ver_index.append(0)
-
-                                if dir_in_bin is not None:
-                                    img_fin = preprocess_and_resize_image_for_ocrcnn_model(
-                                        img_crop_bin, image_height, image_width)
-                                    cropped_lines_bin.append(img_fin)
-
-                    if self.export_textline_images_and_text:
-                        if img_crop.shape[0]==0 or img_crop.shape[1]==0:
-                            pass
-                        else:
-                            if child_textlines.tag.endswith("TextEquiv"):
-                                for cheild_text in child_textlines:
-                                    if cheild_text.tag.endswith("Unicode"):
-                                        textline_text = cheild_text.text
-                                        if textline_text:
-                                            base_name = os.path.join(
-                                                dir_out, file_name + '_line_' + str(indexer_textlines))
-                                            if self.pref_of_dataset:
-                                                base_name += '_' + self.pref_of_dataset
-                                            if not self.do_not_mask_with_textline_contour:
-                                                base_name += '_masked'
-
-                                            with open(base_name + '.txt', 'w') as text_file:
-                                                text_file.write(textline_text)
-                                            cv2.imwrite(base_name + '.png', img_crop)
-                                            indexer_textlines+=1
-
-                    if not self.export_textline_images_and_text:
-                        indexer_text_region = indexer_text_region +1
-
-            if not self.export_textline_images_and_text:
-                extracted_texts = []
-                extracted_conf_value = []
-
-                n_iterations = math.ceil(len(cropped_lines) / self.b_s)
-
-                for i in range(n_iterations):
-                    if i==(n_iterations-1):
-                        n_start = i*self.b_s
-                        imgs = cropped_lines[n_start:]
-                        imgs = np.array(imgs)
-                        imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3)
-
-                        ver_imgs = np.array( cropped_lines_ver_index[n_start:] )
-                        indices_ver = np.where(ver_imgs == 1)[0]
-
-                        #print(indices_ver, 'indices_ver')
-                        if len(indices_ver)>0:
-                            imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
-                            imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
-                            #print(imgs_ver_flipped, 'imgs_ver_flipped')
-
-                        else:
-                            imgs_ver_flipped = None
-
-                        if dir_in_bin is not None:
-                            imgs_bin = cropped_lines_bin[n_start:]
-                            imgs_bin = np.array(imgs_bin)
-                            imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3)
-
-                            if len(indices_ver)>0:
-                                imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
-                                imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
-                                #print(imgs_ver_flipped, 'imgs_ver_flipped')
-                            else:
-                                imgs_bin_ver_flipped = None
-                    else:
-                        n_start = i*self.b_s
-                        n_end = (i+1)*self.b_s
-                        imgs = cropped_lines[n_start:n_end]
-                        imgs = np.array(imgs).reshape(self.b_s, image_height, image_width, 3)
-
-                        ver_imgs = np.array( cropped_lines_ver_index[n_start:n_end] )
-                        indices_ver = np.where(ver_imgs == 1)[0]
-                        #print(indices_ver, 'indices_ver')
-                        if len(indices_ver)>0:
-                            imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
-                            imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
-                            #print(imgs_ver_flipped, 'imgs_ver_flipped')
-                        else:
-                            imgs_ver_flipped = None
-
-                        if dir_in_bin is not None:
-                            imgs_bin = cropped_lines_bin[n_start:n_end]
-                            imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3)
-
-                            if len(indices_ver)>0:
-                                imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
-                                imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
-                                #print(imgs_ver_flipped, 'imgs_ver_flipped')
-                            else:
-                                imgs_bin_ver_flipped = None
-
-                    self.logger.debug("processing next %d lines", len(imgs))
-                    preds = self.model_zoo.get('ocr').predict(imgs, verbose=0)
-
-                    if len(indices_ver)>0:
-                        preds_flipped = self.model_zoo.get('ocr').predict(imgs_ver_flipped, verbose=0)
+                    if w_scaled < 750:#1.5*image_width:
+                        img_fin = preprocess_and_resize_image_for_ocrcnn_model(
+                            img_crop, image_height, image_width)
+                        cropped_lines.append(img_fin)
+                        if abs(better_des_slope) > 45:
+                            cropped_lines_ver_index.append(1)
+                        else:
+                            cropped_lines_ver_index.append(0)
+
+                        cropped_lines_meging_indexing.append(0)
+                        if dir_in_bin is not None:
+                            img_fin = preprocess_and_resize_image_for_ocrcnn_model(
+                                img_crop_bin, image_height, image_width)
+                            cropped_lines_bin.append(img_fin)
+                    else:
+                        splited_images, splited_images_bin = return_textlines_split_if_needed(
+                            img_crop, img_crop_bin if dir_in_bin is not None else None)
+                        if splited_images:
+                            img_fin = preprocess_and_resize_image_for_ocrcnn_model(
+                                splited_images[0], image_height, image_width)
+                            cropped_lines.append(img_fin)
+                            cropped_lines_meging_indexing.append(1)
+
+                            if abs(better_des_slope) > 45:
+                                cropped_lines_ver_index.append(1)
+                            else:
+                                cropped_lines_ver_index.append(0)
+
+                            img_fin = preprocess_and_resize_image_for_ocrcnn_model(
+                                splited_images[1], image_height, image_width)
+
+                            cropped_lines.append(img_fin)
+                            cropped_lines_meging_indexing.append(-1)
+
+                            if abs(better_des_slope) > 45:
+                                cropped_lines_ver_index.append(1)
+                            else:
+                                cropped_lines_ver_index.append(0)
+
+                            if dir_in_bin is not None:
+                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
+                                    splited_images_bin[0], image_height, image_width)
+                                cropped_lines_bin.append(img_fin)
+                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
+                                    splited_images_bin[1], image_height, image_width)
+                                cropped_lines_bin.append(img_fin)
+                        else:
+                            img_fin = preprocess_and_resize_image_for_ocrcnn_model(
+                                img_crop, image_height, image_width)
+                            cropped_lines.append(img_fin)
+                            cropped_lines_meging_indexing.append(0)
+
+                            if abs(better_des_slope) > 45:
+                                cropped_lines_ver_index.append(1)
+                            else:
+                                cropped_lines_ver_index.append(0)
+
+                            if dir_in_bin is not None:
+                                img_fin = preprocess_and_resize_image_for_ocrcnn_model(
+                                    img_crop_bin, image_height, image_width)
+                                cropped_lines_bin.append(img_fin)
+
+                    indexer_text_region = indexer_text_region +1
+
+            extracted_texts = []
+            extracted_conf_value = []
+
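+            # Recognize the cropped lines in batches of self.b_s; the final
+            # batch simply takes whatever lines remain.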
+            n_iterations = math.ceil(len(cropped_lines) / self.b_s)
+
+            for i in range(n_iterations):
+                if i==(n_iterations-1):
+                    n_start = i*self.b_s
+                    imgs = cropped_lines[n_start:]
+                    imgs = np.array(imgs)
+                    imgs = imgs.reshape(imgs.shape[0], image_height, image_width, 3)
+
+                    ver_imgs = np.array( cropped_lines_ver_index[n_start:] )
+                    indices_ver = np.where(ver_imgs == 1)[0]
+
+                    #print(indices_ver, 'indices_ver')
+                    if len(indices_ver)>0:
+                        imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
+                        imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
+                        #print(imgs_ver_flipped, 'imgs_ver_flipped')
+                    else:
+                        imgs_ver_flipped = None
+
+                    if dir_in_bin is not None:
+                        imgs_bin = cropped_lines_bin[n_start:]
+                        imgs_bin = np.array(imgs_bin)
+                        imgs_bin = imgs_bin.reshape(imgs_bin.shape[0], image_height, image_width, 3)
+
+                        if len(indices_ver)>0:
+                            imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
+                            imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
+                            #print(imgs_ver_flipped, 'imgs_ver_flipped')
+                        else:
+                            imgs_bin_ver_flipped = None
+                else:
+                    n_start = i*self.b_s
+                    n_end = (i+1)*self.b_s
+                    imgs = cropped_lines[n_start:n_end]
+                    imgs = np.array(imgs).reshape(self.b_s, image_height, image_width, 3)
+
+                    ver_imgs = np.array( cropped_lines_ver_index[n_start:n_end] )
+                    indices_ver = np.where(ver_imgs == 1)[0]
+                    #print(indices_ver, 'indices_ver')
+                    if len(indices_ver)>0:
+                        imgs_ver_flipped = imgs[indices_ver, : ,: ,:]
+                        imgs_ver_flipped = imgs_ver_flipped[:,::-1,::-1,:]
+                        #print(imgs_ver_flipped, 'imgs_ver_flipped')
+                    else:
+                        imgs_ver_flipped = None
+
+                    if dir_in_bin is not None:
+                        imgs_bin = cropped_lines_bin[n_start:n_end]
+                        imgs_bin = np.array(imgs_bin).reshape(self.b_s, image_height, image_width, 3)
+
+                        if len(indices_ver)>0:
+                            imgs_bin_ver_flipped = imgs_bin[indices_ver, : ,: ,:]
+                            imgs_bin_ver_flipped = imgs_bin_ver_flipped[:,::-1,::-1,:]
+                            #print(imgs_ver_flipped, 'imgs_ver_flipped')
+                        else:
+                            imgs_bin_ver_flipped = None
+
+                self.logger.debug("processing next %d lines", len(imgs))
+                preds = self.model_zoo.get('ocr').predict(imgs, verbose=0)
+
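+                # Lines flagged as vertical (|slope| > 45 degrees) were also
+                # prepared 180-degree rotated (both axes flipped); predict those
+                # copies as well and keep whichever orientation scores the
+                # higher confidence.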
+                if len(indices_ver)>0:
+                    preds_flipped = self.model_zoo.get('ocr').predict(imgs_ver_flipped, verbose=0)
+                    preds_max_fliped = np.max(preds_flipped, axis=2 )
+                    preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
+                    pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character
+                    masked_means_flipped = \
+                        np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / \
+                        np.sum(pred_max_not_unk_mask_bool_flipped, axis=1)
+                    masked_means_flipped[np.isnan(masked_means_flipped)] = 0
+
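+                    # Per-line confidence: mean of the per-timestep maximum
+                    # probabilities, counting only timesteps whose argmax is not
+                    # the end/padding character; all-masked lines become 0.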
+                    preds_max = np.max(preds, axis=2 )
+                    preds_max_args = np.argmax(preds, axis=2 )
+                    pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
+
+                    masked_means = \
+                        np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
+                        np.sum(pred_max_not_unk_mask_bool, axis=1)
+                    masked_means[np.isnan(masked_means)] = 0
+
+                    masked_means_ver = masked_means[indices_ver]
+                    #print(masked_means_ver, 'pred_max_not_unk')
+
+                    indices_where_flipped_conf_value_is_higher = \
+                        np.where(masked_means_flipped > masked_means_ver)[0]
+
+                    #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher')
+                    if len(indices_where_flipped_conf_value_is_higher)>0:
+                        indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
+                        preds[indices_to_be_replaced,:,:] = \
+                            preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
+                if dir_in_bin is not None:
+                    preds_bin = self.model_zoo.get('ocr').predict(imgs_bin, verbose=0)
+
+                    if len(indices_ver)>0:
+                        preds_flipped = self.model_zoo.get('ocr').predict(imgs_bin_ver_flipped, verbose=0)
                         preds_max_fliped = np.max(preds_flipped, axis=2 )
                         preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
                         pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character
@@ -789,212 +774,179 @@ class Eynollah_ocr:
                         #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher')
                         if len(indices_where_flipped_conf_value_is_higher)>0:
                             indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
-                            preds[indices_to_be_replaced,:,:] = \
+                            preds_bin[indices_to_be_replaced,:,:] = \
                                 preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
-                    if dir_in_bin is not None:
-                        preds_bin = self.model_zoo.get('ocr').predict(imgs_bin, verbose=0)
-
-                        if len(indices_ver)>0:
-                            preds_flipped = self.model_zoo.get('ocr').predict(imgs_bin_ver_flipped, verbose=0)
-                            preds_max_fliped = np.max(preds_flipped, axis=2 )
-                            preds_max_args_flipped = np.argmax(preds_flipped, axis=2 )
-                            pred_max_not_unk_mask_bool_flipped = preds_max_args_flipped[:,:]!=self.end_character
-                            masked_means_flipped = \
-                                np.sum(preds_max_fliped * pred_max_not_unk_mask_bool_flipped, axis=1) / \
-                                np.sum(pred_max_not_unk_mask_bool_flipped, axis=1)
-                            masked_means_flipped[np.isnan(masked_means_flipped)] = 0
-
-                            preds_max = np.max(preds, axis=2 )
-                            preds_max_args = np.argmax(preds, axis=2 )
-                            pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
-
-                            masked_means = \
-                                np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
-                                np.sum(pred_max_not_unk_mask_bool, axis=1)
-                            masked_means[np.isnan(masked_means)] = 0
-
-                            masked_means_ver = masked_means[indices_ver]
-                            #print(masked_means_ver, 'pred_max_not_unk')
-
-                            indices_where_flipped_conf_value_is_higher = \
-                                np.where(masked_means_flipped > masked_means_ver)[0]
-
-                            #print(indices_where_flipped_conf_value_is_higher, 'indices_where_flipped_conf_value_is_higher')
-                            if len(indices_where_flipped_conf_value_is_higher)>0:
-                                indices_to_be_replaced = indices_ver[indices_where_flipped_conf_value_is_higher]
-                                preds_bin[indices_to_be_replaced,:,:] = \
-                                    preds_flipped[indices_where_flipped_conf_value_is_higher, :, :]
-
-                        preds = (preds + preds_bin) / 2.
-
-                    pred_texts = decode_batch_predictions(preds, self.model_zoo.get('num_to_char'))
-
-                    preds_max = np.max(preds, axis=2 )
-                    preds_max_args = np.argmax(preds, axis=2 )
-                    pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
-                    masked_means = \
-                        np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
-                        np.sum(pred_max_not_unk_mask_bool, axis=1)
-
-                    for ib in range(imgs.shape[0]):
-                        pred_texts_ib = pred_texts[ib].replace("[UNK]", "")
-                        if masked_means[ib] >= self.min_conf_value_of_textline_text:
-                            extracted_texts.append(pred_texts_ib)
-                            extracted_conf_value.append(masked_means[ib])
-                        else:
-                            extracted_texts.append("")
-                            extracted_conf_value.append(0)
-                del cropped_lines
-                if dir_in_bin is not None:
-                    del cropped_lines_bin
-                gc.collect()
-
-                extracted_texts_merged = [extracted_texts[ind]
-                                          if cropped_lines_meging_indexing[ind]==0
-                                          else extracted_texts[ind]+" "+extracted_texts[ind+1]
-                                          if cropped_lines_meging_indexing[ind]==1
-                                          else None
-                                          for ind in range(len(cropped_lines_meging_indexing))]
-
-                extracted_conf_value_merged = [extracted_conf_value[ind]
-                                               if cropped_lines_meging_indexing[ind]==0
-                                               else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2.
-                                               if cropped_lines_meging_indexing[ind]==1
-                                               else None
-                                               for ind in range(len(cropped_lines_meging_indexing))]
-
-                extracted_conf_value_merged = [extracted_conf_value_merged[ind_cfm]
-                                               for ind_cfm in range(len(extracted_texts_merged))
-                                               if extracted_texts_merged[ind_cfm] is not None]
-                extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
-                unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
-
-                if dir_out_image_text:
-                    #font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists!
-                    font = importlib_resources.files(__package__) / "Charis-Regular.ttf"
-                    with importlib_resources.as_file(font) as font:
-                        font = ImageFont.truetype(font=font, size=40)
-
-                    for indexer_text, bb_ind in enumerate(total_bb_coordinates):
-                        x_bb = bb_ind[0]
-                        y_bb = bb_ind[1]
-                        w_bb = bb_ind[2]
-                        h_bb = bb_ind[3]
-
-                        font = fit_text_single_line(draw, extracted_texts_merged[indexer_text],
-                                                    font.path, w_bb, int(h_bb*0.4) )
-
-                        ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2)
-
-                        text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font)
-                        text_width = text_bbox[2] - text_bbox[0]
-                        text_height = text_bbox[3] - text_bbox[1]
-
-                        text_x = x_bb + (w_bb - text_width) // 2  # Center horizontally
-                        text_y = y_bb + (h_bb - text_height) // 2  # Center vertically
-
-                        # Draw the text
-                        draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font)
-                    image_text.save(out_image_with_text)
-
-                text_by_textregion = []
-                for ind in unique_cropped_lines_region_indexer:
-                    ind = np.array(cropped_lines_region_indexer)==ind
-                    extracted_texts_merged_un = np.array(extracted_texts_merged)[ind]
-                    if len(extracted_texts_merged_un)>1:
-                        text_by_textregion_ind = ""
-                        next_glue = ""
-                        for indt in range(len(extracted_texts_merged_un)):
-                            if (extracted_texts_merged_un[indt].endswith('⸗') or
-                                extracted_texts_merged_un[indt].endswith('-') or
-                                extracted_texts_merged_un[indt].endswith('¬')):
-                                text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt][:-1]
-                                next_glue = ""
-                            else:
-                                text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt]
-                                next_glue = " "
-                        text_by_textregion.append(text_by_textregion_ind)
-                    else:
-                        text_by_textregion.append(" ".join(extracted_texts_merged_un))
-                #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion')
-
-                ###index_tot_regions = []
-                ###tot_region_ref = []
-
-                ###for jj in root1.iter(link+'RegionRefIndexed'):
-                ###index_tot_regions.append(jj.attrib['index'])
-                ###tot_region_ref.append(jj.attrib['regionRef'])
-
-                ###id_to_order = {tid: ro for tid, ro in zip(tot_region_ref, index_tot_regions)}
-
-                #id_textregions = []
-                #textregions_by_existing_ids = []
-                indexer = 0
-                indexer_textregion = 0
-                for nn in root1.iter(region_tags):
-                    #id_textregion = nn.attrib['id']
-                    #id_textregions.append(id_textregion)
-                    #textregions_by_existing_ids.append(text_by_textregion[indexer_textregion])
-
-                    is_textregion_text = False
-                    for childtest in nn:
-                        if childtest.tag.endswith("TextEquiv"):
-                            is_textregion_text = True
-
-                    if not is_textregion_text:
-                        text_subelement_textregion = ET.SubElement(nn, 'TextEquiv')
-                        unicode_textregion = ET.SubElement(text_subelement_textregion, 'Unicode')
-
-                    has_textline = False
-                    for child_textregion in nn:
-                        if child_textregion.tag.endswith("TextLine"):
-
-                            is_textline_text = False
-                            for childtest2 in child_textregion:
-                                if childtest2.tag.endswith("TextEquiv"):
-                                    is_textline_text = True
-
-                            if not is_textline_text:
-                                text_subelement = ET.SubElement(child_textregion, 'TextEquiv')
-                                text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}")
-                                unicode_textline = ET.SubElement(text_subelement, 'Unicode')
-                                unicode_textline.text = extracted_texts_merged[indexer]
-                            else:
-                                for childtest3 in child_textregion:
-                                    if childtest3.tag.endswith("TextEquiv"):
-                                        for child_uc in childtest3:
-                                            if child_uc.tag.endswith("Unicode"):
-                                                childtest3.set('conf',
-                                                               f"{extracted_conf_value_merged[indexer]:.2f}")
-                                                child_uc.text = extracted_texts_merged[indexer]
-
-                            indexer = indexer + 1
-                            has_textline = True
-                    if has_textline:
-                        if is_textregion_text:
-                            for child4 in nn:
-                                if child4.tag.endswith("TextEquiv"):
-                                    for childtr_uc in child4:
-                                        if childtr_uc.tag.endswith("Unicode"):
-                                            childtr_uc.text = text_by_textregion[indexer_textregion]
-                        else:
-                            unicode_textregion.text = text_by_textregion[indexer_textregion]
-                        indexer_textregion = indexer_textregion + 1
-
-                ###sample_order = [(id_to_order[tid], text)
-                ###                for tid, text in zip(id_textregions, textregions_by_existing_ids)
-                ###                if tid in id_to_order]
-
-                ##ordered_texts_sample = [text for _, text in sorted(sample_order)]
-                ##tot_page_text = ' '.join(ordered_texts_sample)
-
-                ##for page_element in root1.iter(link+'Page'):
-                ##text_page = ET.SubElement(page_element, 'TextEquiv')
-                ##unicode_textpage = ET.SubElement(text_page, 'Unicode')
-                ##unicode_textpage.text = tot_page_text
-
-                ET.register_namespace("",name_space)
-                tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf-8",default_namespace=None)
-                #print("Job done in %.1fs", time.time() - t0)
+
+                    preds = (preds + preds_bin) / 2.
+
+                pred_texts = decode_batch_predictions(preds, self.model_zoo.get('num_to_char'))
+
+                preds_max = np.max(preds, axis=2 )
+                preds_max_args = np.argmax(preds, axis=2 )
+                pred_max_not_unk_mask_bool = preds_max_args[:,:]!=self.end_character
+                masked_means = \
+                    np.sum(preds_max * pred_max_not_unk_mask_bool, axis=1) / \
+                    np.sum(pred_max_not_unk_mask_bool, axis=1)
+
+                for ib in range(imgs.shape[0]):
+                    pred_texts_ib = pred_texts[ib].replace("[UNK]", "")
+                    if masked_means[ib] >= self.min_conf_value_of_textline_text:
+                        extracted_texts.append(pred_texts_ib)
+                        extracted_conf_value.append(masked_means[ib])
+                    else:
+                        extracted_texts.append("")
+                        extracted_conf_value.append(0)
+            del cropped_lines
+            if dir_in_bin is not None:
+                del cropped_lines_bin
+            gc.collect()
+
+            extracted_texts_merged = [extracted_texts[ind]
+                                      if cropped_lines_meging_indexing[ind]==0
+                                      else extracted_texts[ind]+" "+extracted_texts[ind+1]
+                                      if cropped_lines_meging_indexing[ind]==1
+                                      else None
+                                      for ind in range(len(cropped_lines_meging_indexing))]
+
+            extracted_conf_value_merged = [extracted_conf_value[ind]
+                                           if cropped_lines_meging_indexing[ind]==0
+                                           else (extracted_conf_value[ind]+extracted_conf_value[ind+1])/2.
+                                           if cropped_lines_meging_indexing[ind]==1
+                                           else None
+                                           for ind in range(len(cropped_lines_meging_indexing))]
+
+            extracted_conf_value_merged = [extracted_conf_value_merged[ind_cfm]
+                                           for ind_cfm in range(len(extracted_texts_merged))
+                                           if extracted_texts_merged[ind_cfm] is not None]
+            extracted_texts_merged = [ind for ind in extracted_texts_merged if ind is not None]
+            unique_cropped_lines_region_indexer = np.unique(cropped_lines_region_indexer)
+
+            if dir_out_image_text:
+                #font_path = "Charis-7.000/Charis-Regular.ttf" # Make sure this file exists!
+                font = importlib_resources.files(__package__) / "Charis-Regular.ttf"
+                with importlib_resources.as_file(font) as font:
+                    font = ImageFont.truetype(font=font, size=40)
+
+                for indexer_text, bb_ind in enumerate(total_bb_coordinates):
+                    x_bb = bb_ind[0]
+                    y_bb = bb_ind[1]
+                    w_bb = bb_ind[2]
+                    h_bb = bb_ind[3]
+
+                    font = fit_text_single_line(draw, extracted_texts_merged[indexer_text],
+                                                font.path, w_bb, int(h_bb*0.4) )
+
+                    ##draw.rectangle([x_bb, y_bb, x_bb + w_bb, y_bb + h_bb], outline="red", width=2)
+
+                    text_bbox = draw.textbbox((0, 0), extracted_texts_merged[indexer_text], font=font)
+                    text_width = text_bbox[2] - text_bbox[0]
+                    text_height = text_bbox[3] - text_bbox[1]
+
+                    text_x = x_bb + (w_bb - text_width) // 2  # Center horizontally
+                    text_y = y_bb + (h_bb - text_height) // 2  # Center vertically
+
+                    # Draw the text
+                    draw.text((text_x, text_y), extracted_texts_merged[indexer_text], fill="black", font=font)
+                image_text.save(out_image_with_text)
+
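+            # Join the per-line texts into region-level text: a line ending in a
+            # hyphenation mark (⸗, - or ¬) is glued to the next line without a
+            # space and the mark is dropped; otherwise lines are joined with a
+            # space.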
+            text_by_textregion = []
+            for ind in unique_cropped_lines_region_indexer:
+                ind = np.array(cropped_lines_region_indexer)==ind
+                extracted_texts_merged_un = np.array(extracted_texts_merged)[ind]
+                if len(extracted_texts_merged_un)>1:
+                    text_by_textregion_ind = ""
+                    next_glue = ""
+                    for indt in range(len(extracted_texts_merged_un)):
+                        if (extracted_texts_merged_un[indt].endswith('⸗') or
+                            extracted_texts_merged_un[indt].endswith('-') or
+                            extracted_texts_merged_un[indt].endswith('¬')):
+                            text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt][:-1]
+                            next_glue = ""
+                        else:
+                            text_by_textregion_ind += next_glue + extracted_texts_merged_un[indt]
+                            next_glue = " "
+                    text_by_textregion.append(text_by_textregion_ind)
+                else:
+                    text_by_textregion.append(" ".join(extracted_texts_merged_un))
+            #print(text_by_textregion, 'text_by_textregiontext_by_textregiontext_by_textregiontext_by_textregiontext_by_textregion')
+
+            ###index_tot_regions = []
+            ###tot_region_ref = []
+
+            ###for jj in root1.iter(link+'RegionRefIndexed'):
+            ###index_tot_regions.append(jj.attrib['index'])
+            ###tot_region_ref.append(jj.attrib['regionRef'])
+
+            ###id_to_order = {tid: ro for tid, ro in zip(tot_region_ref, index_tot_regions)}
+
+            #id_textregions = []
+            #textregions_by_existing_ids = []
+            indexer = 0
+            indexer_textregion = 0
+            for nn in root1.iter(region_tags):
+                #id_textregion = nn.attrib['id']
+                #id_textregions.append(id_textregion)
+                #textregions_by_existing_ids.append(text_by_textregion[indexer_textregion])
+
+                is_textregion_text = False
+                for childtest in nn:
+                    if childtest.tag.endswith("TextEquiv"):
+                        is_textregion_text = True
+
+                if not is_textregion_text:
+                    text_subelement_textregion = ET.SubElement(nn, 'TextEquiv')
+                    unicode_textregion = ET.SubElement(text_subelement_textregion, 'Unicode')
+
+                has_textline = False
+                for child_textregion in nn:
+                    if child_textregion.tag.endswith("TextLine"):
+
+                        is_textline_text = False
+                        for childtest2 in child_textregion:
+                            if childtest2.tag.endswith("TextEquiv"):
+                                is_textline_text = True
+
+                        if not is_textline_text:
+                            text_subelement = ET.SubElement(child_textregion, 'TextEquiv')
+                            text_subelement.set('conf', f"{extracted_conf_value_merged[indexer]:.2f}")
+                            unicode_textline = ET.SubElement(text_subelement, 'Unicode')
+                            unicode_textline.text = extracted_texts_merged[indexer]
+                        else:
+                            for childtest3 in child_textregion:
+                                if childtest3.tag.endswith("TextEquiv"):
+                                    for child_uc in childtest3:
+                                        if child_uc.tag.endswith("Unicode"):
+                                            childtest3.set('conf',
+                                                           f"{extracted_conf_value_merged[indexer]:.2f}")
+                                            child_uc.text = extracted_texts_merged[indexer]
+
+                        indexer = indexer + 1
+                        has_textline = True
+                if has_textline:
+                    if is_textregion_text:
+                        for child4 in nn:
+                            if child4.tag.endswith("TextEquiv"):
+                                for childtr_uc in child4:
+                                    if childtr_uc.tag.endswith("Unicode"):
+                                        childtr_uc.text = text_by_textregion[indexer_textregion]
+                    else:
+                        unicode_textregion.text = text_by_textregion[indexer_textregion]
+                    indexer_textregion = indexer_textregion + 1
+
+            ###sample_order = [(id_to_order[tid], text)
+            ###                for tid, text in zip(id_textregions, textregions_by_existing_ids)
+            ###                if tid in id_to_order]
+
+            ##ordered_texts_sample = [text for _, text in sorted(sample_order)]
+            ##tot_page_text = ' '.join(ordered_texts_sample)
+
+            ##for page_element in root1.iter(link+'Page'):
+            ##text_page = ET.SubElement(page_element, 'TextEquiv')
+            ##unicode_textpage = ET.SubElement(text_page, 'Unicode')
+            ##unicode_textpage.text = tot_page_text
+
+            ET.register_namespace("",name_space)
+            tree1.write(out_file_ocr,xml_declaration=True,method='xml',encoding="utf-8",default_namespace=None)
+            #print("Job done in %.1fs", time.time() - t0)
diff --git a/src/eynollah/training/cli.py b/src/eynollah/training/cli.py
index 8ab754d..65a7a8a 100644
--- a/src/eynollah/training/cli.py
+++ b/src/eynollah/training/cli.py
@@ -8,6 +8,7 @@ from .build_model_load_pretrained_weights_and_save import build_model_load_pretr
 from .generate_gt_for_training import main as generate_gt_cli
 from .inference import main as inference_cli
 from .train import ex
+from .extract_line_gt import linegt_cli
 
 @click.command(context_settings=dict(
     ignore_unknown_options=True,
@@ -24,3 +25,4 @@ main.add_command(build_model_load_pretrained_weights_and_save)
 main.add_command(generate_gt_cli, 'generate-gt')
 main.add_command(inference_cli, 'inference')
 main.add_command(train_cli, 'train')
+main.add_command(linegt_cli, 'export_textline_images_and_text')
diff --git a/src/eynollah/training/extract_line_gt.py b/src/eynollah/training/extract_line_gt.py
new file mode 100644
index 0000000..bda21e7
--- /dev/null
+++ b/src/eynollah/training/extract_line_gt.py
@@ -0,0 +1,136 @@
+from logging import Logger, getLogger
+from typing import Optional
+from pathlib import Path
+import os
+
+import click
+import cv2
+import xml.etree.ElementTree as ET
+import numpy as np
+
+from ..utils import is_image_filename
+
+@click.command()
+@click.option(
+    "--image",
+    "-i",
+    'image_filename',
+    help="input image filename",
+    type=click.Path(exists=True, dir_okay=False),
+)
+@click.option(
+    "--dir_in",
+    "-di",
+    help="directory of input images (instead of --image)",
+    type=click.Path(exists=True, file_okay=False),
+)
+@click.option(
+    "--dir_xmls",
+    "-dx",
+    help="directory of input PAGE-XML files (in addition to --dir_in; filename stems must match the image files, with '.xml' suffix).",
+    type=click.Path(exists=True, file_okay=False),
+    required=True,
+)
+@click.option(
+    "--out",
+    "-o",
+    'dir_out',
+    help="directory for the generated textline image/text files",
+    type=click.Path(exists=True, file_okay=False),
+    required=True,
+)
+@click.option(
+    "--dataset_abbrevation",
+    "-ds_pref",
+    'pref_of_dataset',
+    help="optional abbreviation of the dataset name, appended to the names of the files generated from the PAGE-XML GT",
+)
+@click.option(
+    "--do_not_mask_with_textline_contour",
+    "-nmtc/-mtc",
+    is_flag=True,
+    help="if set to true, cropped textline images will not be masked with the textline contour.",
+)
+def linegt_cli(
+    image_filename,
+    dir_in,
+    dir_xmls,
+    dir_out,
+    pref_of_dataset,
+    do_not_mask_with_textline_contour,
+):
+    assert bool(dir_in) ^ bool(image_filename), "Set --dir_in or --image, not both"
+    if dir_in:
+        ls_imgs = [
+            os.path.join(dir_in, image_filename) for image_filename in filter(is_image_filename, os.listdir(dir_in))
+        ]
+    else:
+        assert image_filename
+        ls_imgs = [image_filename]
+
+    for dir_img in ls_imgs:
+        file_name = Path(dir_img).stem
+        dir_xml = os.path.join(dir_xmls, file_name + '.xml')
+
+        img = cv2.imread(dir_img)
+
+        total_bb_coordinates = []
+
+        tree1 = ET.parse(dir_xml, parser=ET.XMLParser(encoding="utf-8"))
+        root1 = tree1.getroot()
+        alltags = [elem.tag for elem in root1.iter()]
+
+        name_space = alltags[0].split('}')[0]
+        name_space = name_space.split('{')[1]
+
+        region_tags = [x for x in alltags if x.endswith('TextRegion')][0]
+
+        cropped_lines_region_indexer = []
+
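+        # Walk the PAGE-XML tree: within each TextLine, the Coords element gives
+        # the line polygon (used to crop and optionally mask the line image) and
+        # the TextEquiv/Unicode element gives the transcription. Each pair is
+        # written out as a .png/.txt file for OCR training.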
+        indexer_text_region = 0
+        indexer_textlines = 0
+        # FIXME: non recursive, use OCR-D PAGE generateDS API. Or use an existing tool for this purpose altogether
+        for nn in root1.iter(region_tags):
+            for child_textregion in nn:
+                if child_textregion.tag.endswith("TextLine"):
+                    for child_textlines in child_textregion:
+                        if child_textlines.tag.endswith("Coords"):
+                            cropped_lines_region_indexer.append(indexer_text_region)
+                            p_h = child_textlines.attrib['points'].split(' ')
+                            textline_coords = np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])
+
+                            x, y, w, h = cv2.boundingRect(textline_coords)
+
+                            total_bb_coordinates.append([x, y, w, h])
+
+                            img_poly_on_img = np.copy(img)
+
+                            mask_poly = np.zeros(img.shape)
+                            mask_poly = cv2.fillPoly(mask_poly, pts=[textline_coords], color=(1, 1, 1))
+
+                            mask_poly = mask_poly[y : y + h, x : x + w, :]
+                            img_crop = img_poly_on_img[y : y + h, x : x + w, :]
+
+                            if not do_not_mask_with_textline_contour:
+                                img_crop[mask_poly == 0] = 255
+
+                        if img_crop.shape[0] == 0 or img_crop.shape[1] == 0:
+                            continue
+
+                        if child_textlines.tag.endswith("TextEquiv"):
+                            for child_text in child_textlines:
+                                if child_text.tag.endswith("Unicode"):
+                                    textline_text = child_text.text
+                                    if textline_text:
+                                        base_name = os.path.join(
+                                            dir_out, file_name + '_line_' + str(indexer_textlines)
+                                        )
+                                        if pref_of_dataset:
+                                            base_name += '_' + pref_of_dataset
+                                        if not do_not_mask_with_textline_contour:
+                                            base_name += '_masked'
+
+                                        with open(base_name + '.txt', 'w') as text_file:
+                                            text_file.write(textline_text)
+                                        cv2.imwrite(base_name + '.png', img_crop)
+                                        indexer_textlines += 1
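+
+# A sketch of the intended invocation (hypothetical entry-point name; the
+# subcommand itself is registered as 'export_textline_images_and_text' in
+# src/eynollah/training/cli.py):
+#
+#   <eynollah-training> export_textline_images_and_text \
+#       -di images/ -dx page_xml/ -o line_gt/ -ds_pref NAME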