From 5d447abcc4e24cec25e228fb93f95bdd6e549e5a Mon Sep 17 00:00:00 2001
From: vahidrezanezhad <vahid631983@gmail.com>
Date: Sat, 3 May 2025 02:59:16 +0200
Subject: [PATCH] Allow adding a dataset abbreviation to extracted textline
 images and text

---
 src/eynollah/cli.py      | 17 +++++++-
 src/eynollah/eynollah.py | 91 ++++++++++++++++++++++++----------------
 2 files changed, 71 insertions(+), 37 deletions(-)

diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py
index 56d5d7e..7d08ac8 100644
--- a/src/eynollah/cli.py
+++ b/src/eynollah/cli.py
@@ -342,7 +342,6 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     "-m",
     help="directory of models",
     type=click.Path(exists=True, file_okay=False),
-    required=True,
 )
 @click.option(
     "--tr_ocr",
@@ -379,6 +378,11 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     "-bs",
     help="number of inference batch size. Default b_s for trocr and cnn_rnn models are 2 and 8 respectively",
 )
+@click.option(
+    "--dataset_abbrevation",
+    "-ds_pref",
+    help="in the case of extracting textline and text from a xml GT file user can add an abbrevation of dataset name to generated dataset",
+)
 @click.option(
     "--log_level",
     "-l",
@@ -386,10 +390,18 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     help="Override log level globally to this",
 )
 
-def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, log_level):
+def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, export_textline_images_and_text, do_not_mask_with_textline_contour, draw_texts_on_image, prediction_with_both_of_rgb_and_bin, batch_size, dataset_abbrevation, log_level):
     initLogging()
     if log_level:
         getLogger('eynollah').setLevel(getLevelName(log_level))
+    assert not export_textline_images_and_text or not tr_ocr, "Exporting textline and text  -etit can not be set alongside transformer ocr -tr_ocr"
+    assert not export_textline_images_and_text or not model, "Exporting textline and text  -etit can not be set alongside model -m"
+    assert not export_textline_images_and_text or not batch_size, "Exporting textline and text  -etit can not be set alongside batch size -bs"
+    assert not export_textline_images_and_text or not dir_in_bin, "Exporting textline and text  -etit can not be set alongside directory of bin images -dib"
+    assert not export_textline_images_and_text or not dir_out_image_text, "Exporting textline and text  -etit can not be set alongside directory of images with predicted text -doit"
+    assert not export_textline_images_and_text or not draw_texts_on_image, "Exporting textline and text  -etit can not be set alongside draw text on image -dtoi"
+    assert not export_textline_images_and_text or not prediction_with_both_of_rgb_and_bin, "Exporting textline and text  -etit can not be set alongside prediction with both rgb and bin -brb"
+    
     eynollah_ocr = Eynollah_ocr(
         dir_xmls=dir_xmls,
         dir_out_image_text=dir_out_image_text,
@@ -403,6 +415,7 @@ def ocr(dir_in, dir_in_bin, out, dir_xmls, dir_out_image_text, model, tr_ocr, ex
         draw_texts_on_image=draw_texts_on_image,
         prediction_with_both_of_rgb_and_bin=prediction_with_both_of_rgb_and_bin,
         batch_size=batch_size,
+        pref_of_dataset=dataset_abbrevation,
     )
     eynollah_ocr.run()
 
diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py
index cc1f766..0b15573 100644
--- a/src/eynollah/eynollah.py
+++ b/src/eynollah/eynollah.py
@@ -4877,6 +4877,7 @@ class Eynollah_ocr:
         do_not_mask_with_textline_contour=False,
         draw_texts_on_image=False,
         prediction_with_both_of_rgb_and_bin=False,
+        pref_of_dataset = None,
         logger=None,
     ):
         self.dir_in = dir_in
@@ -4890,43 +4891,45 @@ class Eynollah_ocr:
         self.draw_texts_on_image = draw_texts_on_image
         self.dir_out_image_text = dir_out_image_text
         self.prediction_with_both_of_rgb_and_bin = prediction_with_both_of_rgb_and_bin
-        if tr_ocr:
-            self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
-            self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-            self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124"
-            self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir)
-            self.model_ocr.to(self.device)
-            if not batch_size:
-                self.b_s = 2
+        self.pref_of_dataset = pref_of_dataset
+        if not export_textline_images_and_text:
+            if tr_ocr:
+                self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
+                self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+                self.model_ocr_dir = dir_models + "/trocr_model_ens_of_3_checkpoints_201124"
+                self.model_ocr = VisionEncoderDecoderModel.from_pretrained(self.model_ocr_dir)
+                self.model_ocr.to(self.device)
+                if not batch_size:
+                    self.b_s = 2
+                else:
+                    self.b_s = int(batch_size)
+
             else:
-                self.b_s = int(batch_size)
-
-        else:
-            self.model_ocr_dir = dir_models + "/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn"
-            model_ocr = load_model(self.model_ocr_dir , compile=False)
-            
-            self.prediction_model = tf.keras.models.Model(
-                            model_ocr.get_layer(name = "image").input, 
-                            model_ocr.get_layer(name = "dense2").output)
-            if not batch_size:
-                self.b_s = 8
-            else:
-                self.b_s = int(batch_size)
-
+                self.model_ocr_dir = dir_models + "/model_step_1050000_ocr"#"/model_0_ocr_cnnrnn"#"/model_23_ocr_cnnrnn"
+                model_ocr = load_model(self.model_ocr_dir , compile=False)
                 
-            with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file:
-                characters = json.load(config_file)
+                self.prediction_model = tf.keras.models.Model(
+                                model_ocr.get_layer(name = "image").input, 
+                                model_ocr.get_layer(name = "dense2").output)
+                if not batch_size:
+                    self.b_s = 8
+                else:
+                    self.b_s = int(batch_size)
 
-                
-            AUTOTUNE = tf.data.AUTOTUNE
+                    
+                with open(os.path.join(self.model_ocr_dir, "characters_org.txt"),"r") as config_file:
+                    characters = json.load(config_file)
 
-            # Mapping characters to integers.
-            char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)
+                    
+                AUTOTUNE = tf.data.AUTOTUNE
 
-            # Mapping integers back to original characters.
-            self.num_to_char = StringLookup(
-                vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
-            )
+                # Mapping characters to integers.
+                char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)
+
+                # Mapping integers back to original characters.
+                self.num_to_char = StringLookup(
+                    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
+                )
 
         
     def decode_batch_predictions(self, pred, max_len = 128):
@@ -5365,10 +5368,28 @@ class Eynollah_ocr:
                                             if cheild_text.tag.endswith("Unicode"):
                                                 textline_text = cheild_text.text
                                                 if textline_text:
-                                                    with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file:
-                                                        text_file.write(textline_text)
+                                                    if self.do_not_mask_with_textline_contour:
+                                                        if self.pref_of_dataset:
+                                                            with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.txt'), 'w') as text_file:
+                                                                text_file.write(textline_text)
 
-                                                    cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop )
+                                                            cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'.png'), img_crop )
+                                                        else:
+                                                            with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.txt'), 'w') as text_file:
+                                                                text_file.write(textline_text)
+
+                                                            cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'.png'), img_crop )
+                                                    else:
+                                                        if self.pref_of_dataset:
+                                                            with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.txt'), 'w') as text_file:
+                                                                text_file.write(textline_text)
+
+                                                            cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_'+self.pref_of_dataset+'_masked.png'), img_crop )
+                                                        else:
+                                                            with open(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.txt'), 'w') as text_file:
+                                                                text_file.write(textline_text)
+
+                                                            cv2.imwrite(os.path.join(self.dir_out, file_name+'_line_'+str(indexer_textlines)+'_masked.png'), img_crop )
                                                         
                                                 indexer_textlines+=1