From 5d447abcc4e24cec25e228fb93f95bdd6e549e5a Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Sat, 3 May 2025 02:59:16 +0200 Subject: [PATCH] let to add dataset abbrevation to extracted textline images and text --- src/eynollah/cli.py | 17 +++++++- src/eynollah/eynollah.py | 91 ++++++++++++++++++++++++---------------- 2 files changed, 71 insertions(+), 37 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 56d5d7e..7d08ac8 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -342,7 +342,6 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ "-m", help="directory of models", type=click.Path(exists=True, file_okay=False), - required=True, ) @click.option( "--tr_ocr", @@ -379,6 +378,11 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ "-bs", help="number of inference batch size. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively", ) +@click.option( + "--dataset_abbrevation", + "-ds_pref", + help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset", +) @click.option( "--log_level", "-l", @@ -386,10 +390,18 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_ help="Override log level globally to this", ) -def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, log_level): +def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) + assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text -etit can not be set alongside transformer ocr -tr_ocr" + assert not export_textline_images_and_text or not model, "Exporting textline and text -etit can not be set alongside model -m" + assert not export_textline_images_and_text or not batch_size, "Exporting textline and text -etit can not be set alongside batch size -bs" + assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text -etit can not be set alongside directory of bin images -dib" + assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text -etit can not be set alongside directory of images with predicted text -doit" + assert not export_textline_images_and_text or not draw_texts_on_image, "Exporting textline and text -etit can not be set alongside draw text on image -dtoi" + assert not export_textline_images_and_text or not prediction_with_both_of_rgb_and_bin, "Exporting textline and text -etit can not be set alongside prediction with both rgb and bin -brb" + eynollah_ocr = Eynollah_ocr( dir_xmls=dir_xmls, dir_out_image_text=dir_out_image_text, @@ -403,6 +415,7 @@ def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, ex draw_texts_on_image=draw_texts_on_image, prediction_with_both_of_rgb_and_bin=prediction_with_both_of_rgb_and_bin, batch_size=batch_size, + pref_of_dataset=dataset_abbrevation, ) eynollah_ocr.run() diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index cc1f766..0b15573 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -4877,6 +4877,7 @@ class Eynollah_ocr: do_not_mask_with_textline_contour=False, draw_texts_on_image=False, prediction_with_both_of_rgb_and_bin=False, + pref_of_dataset = None, logger=None, ): self.dir_in = dir_in @@ -4890,43 +4891,45 @@ class Eynollah_ocr: self.draw_texts_on_image = draw_texts_on_image self.dir_out_image_text = dir_out_image_text self.prediction_with_both_of_rgb_and_bin = prediction_with_both_of_rgb_and_bin - if tr_ocr: - self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" - self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) - self.model_ocr.to(self.device) - if not batch_size: - self.b_s = 2 + self.pref_of_dataset = pref_of_dataset + if not export_textline_images_and_text: + if tr_ocr: + self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed") + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124" + self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir) + self.model_ocr.to(self.device) + if not batch_size: + self.b_s = 2 + else: + self.b_s = int(batch_size) + else: - self.b_s = int(batch_size) - - else: - self.model_ocr_dir = dir_models + "/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" - model_ocr = load_model(self.model_ocr_dir , compile=False) - - self.prediction_model = tf.keras.models.Model( - model_ocr.get_layer(name = "image").input, - model_ocr.get_layer(name = "dense2").output) - if not batch_size: - self.b_s = 8 - else: - self.b_s = int(batch_size) - + self.model_ocr_dir = dir_models + "/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn" + model_ocr = load_model(self.model_ocr_dir , compile=False) - with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: - characters = json.load(config_file) + self.prediction_model = tf.keras.models.Model( + model_ocr.get_layer(name = "image").input, + model_ocr.get_layer(name = "dense2").output) + if not batch_size: + self.b_s = 8 + else: + self.b_s = int(batch_size) - - AUTOTUNE = tf.data.AUTOTUNE + + with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file: + characters = json.load(config_file) - # Mapping characters to integers. - char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) + + AUTOTUNE = tf.data.AUTOTUNE - # Mapping integers back to original characters. - self.num_to_char = StringLookup( - vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True - ) + # Mapping characters to integers. + char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) + + # Mapping integers back to original characters. + self.num_to_char = StringLookup( + vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True + ) def decode_batch_predictions(self, pred, max_len = 128): @@ -5365,10 +5368,28 @@ class Eynollah_ocr: if cheild_text.tag.endswith("Unicode"): textline_text = cheild_text.text if textline_text: - with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file: - text_file.write(textline_text) + if self.do_not_mask_with_textline_contour: + if self.pref_of_dataset: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.txt'), 'w') as text_file: + text_file.write(textline_text) - cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop ) + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.png'), img_crop ) + else: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file: + text_file.write(textline_text) + + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop ) + else: + if self.pref_of_dataset: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.txt'), 'w') as text_file: + text_file.write(textline_text) + + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.png'), img_crop ) + else: + with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.txt'), 'w') as text_file: + text_file.write(textline_text) + + cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.png'), img_crop ) indexer_textlines+=1