From f03124f747db7edef03d968e1b10db0e7638850d Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Thu, 5 Feb 2026 11:58:50 +0100
Subject: [PATCH] =?UTF-8?q?training.train:=20simplify+fix=20classification?=
 =?UTF-8?q?=20data=20loaders=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

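- unify `generate_data_from_folder_training` with
  `generate_data_from_folder_evaluation` into a single
  `generate_data_from_folder` generator (new `shuffle` keyword, default off)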
- instead of recreating array after every batch, just zero out
- allocate the image batch buffer as uint8 instead of int16
  (and scale it with `// 255` instead of `/ 255.`)
- cast categorical results to float instead of int
---
 src/eynollah/training/train.py | 15 ++++---
 src/eynollah/training/utils.py | 78 ++++++++--------------------------
 2 files changed, 25 insertions(+), 68 deletions(-)

diff --git a/src/eynollah/training/train.py b/src/eynollah/training/train.py
index effc920..0f8d0e9 100644
--- a/src/eynollah/training/train.py
+++ b/src/eynollah/training/train.py
@@ -430,13 +430,13 @@ def run(_config,
                       metrics=['accuracy', F1Score(average='macro', name='f1')])
 
         list_classes = list(classification_classes_name.values())
-        trainXY = generate_data_from_folder_training(
-            dir_train, n_batch, input_height, input_width, n_classes, list_classes)
-        testXY = generate_data_from_folder_evaluation(
-            dir_eval, input_height, input_width, n_classes, list_classes)
+        trainXY = generate_data_from_folder(
+            dir_train, n_batch, input_height, input_width, n_classes, list_classes, shuffle=True)
+        testXY = generate_data_from_folder(
+            dir_eval, n_batch, input_height, input_width, n_classes, list_classes)
+        epoch_size_train = return_number_of_total_training_data(dir_train)
+        epoch_size_eval = return_number_of_total_training_data(dir_eval)
 
-        y_tot = np.zeros((testX.shape[0], n_classes))
-        num_rows = return_number_of_total_training_data(dir_train)
         callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False),
                      SaveWeightsAfterSteps(0, dir_output, _config,
                                            monitor='val_f1',
@@ -444,9 +444,10 @@ def run(_config,
                                            mode='max')]
         
         history = model.fit(trainXY,
-                            steps_per_epoch=num_rows / n_batch,
+                            steps_per_epoch=epoch_size_train // n_batch,
                             #class_weight=weights)
                             validation_data=testXY,
+                            validation_steps=epoch_size_eval // n_batch,
                             verbose=1,
                             epochs=n_epochs,
                             callbacks=callbacks,
diff --git a/src/eynollah/training/utils.py b/src/eynollah/training/utils.py
index 61b2536..5b25a4f 100644
--- a/src/eynollah/training/utils.py
+++ b/src/eynollah/training/utils.py
@@ -166,50 +166,7 @@ def return_number_of_total_training_data(path_classes):
         
     
     
-def generate_data_from_folder_evaluation(path_classes, height, width, n_classes, list_classes):
-    #sub_classes = os.listdir(path_classes)
-    #n_classes = len(sub_classes)
-    all_imgs = []
-    labels = []
-    #dicts =dict()
-    #indexer= 0
-    for indexer, sub_c in enumerate(list_classes):
-        sub_files =  os.listdir(os.path.join(path_classes,sub_c  )) 
-        sub_files = [os.path.join(path_classes,sub_c  )+'/' + x for x in sub_files]
-        #print(     os.listdir(os.path.join(path_classes,sub_c  ))     )
-        all_imgs = all_imgs + sub_files
-        sub_labels = list( np.zeros( len(sub_files) ) +indexer )
-
-        #print( len(sub_labels) )
-        labels = labels + sub_labels
-        #dicts[sub_c] = indexer
-        #indexer +=1 
-        
-
-    categories =  to_categorical(range(n_classes)).astype(np.int16)#[  [1 , 0, 0 , 0 , 0 , 0]  , [0 , 1, 0 , 0 , 0 , 0]  , [0 , 0, 1 , 0 , 0 , 0] , [0 , 0, 0 , 1 , 0 , 0] , [0 , 0, 0 , 0 , 1 , 0]  , [0 , 0, 0 , 0 , 0 , 1] ]
-    ret_x= np.zeros((len(labels), height,width, 3)).astype(np.int16)
-    ret_y= np.zeros((len(labels), n_classes)).astype(np.int16)
-    
-    #print(all_imgs)
-    for i in range(len(all_imgs)):
-        row = all_imgs[i]
-        #####img = cv2.imread(row, 0)
-        #####img= resize_image (img, height, width)
-        #####img = img.astype(np.uint16)
-        #####ret_x[i, :,:,0] = img[:,:]
-        #####ret_x[i, :,:,1] = img[:,:]
-        #####ret_x[i, :,:,2] = img[:,:]
-        
-        img = cv2.imread(row)
-        img= resize_image (img, height, width)
-        img = img.astype(np.uint16)
-        ret_x[i, :,:] = img[:,:,:]
-        
-        ret_y[i, :] =  categories[ int( labels[i] ) ][:]
-    
-    return ret_x/255., ret_y
-
-def generate_data_from_folder_training(path_classes, batchsize, height, width, n_classes, list_classes):
+def generate_data_from_folder(path_classes, batchsize, height, width, n_classes, list_classes, shuffle=False):
     #sub_classes = os.listdir(path_classes)
     #n_classes = len(sub_classes)
 
@@ -228,43 +185,42 @@ def generate_data_from_folder_training(path_classes, batchsize, height, width, n
         labels = labels + sub_labels
         #dicts[sub_c] = indexer
         #indexer +=1 
-        
-    ids = np.array(range(len(labels)))
-    random.shuffle(ids)
-    
-    shuffled_labels = np.array(labels)[ids]
-    shuffled_files = np.array(all_imgs)[ids]
+
+    if shuffle:
+        ids = np.array(range(len(labels)))
+        random.shuffle(ids)
+        labels = np.array(labels)[ids]
+        all_imgs = np.array(all_imgs)[ids]
+
     categories = to_categorical(range(n_classes)).astype(np.int16)#[  [1 , 0, 0 , 0 , 0 , 0]  , [0 , 1, 0 , 0 , 0 , 0]  , [0 , 0, 1 , 0 , 0 , 0] , [0 , 0, 0 , 1 , 0 , 0] , [0 , 0, 0 , 0 , 1 , 0]  , [0 , 0, 0 , 0 , 0 , 1] ]
-    ret_x= np.zeros((batchsize, height,width, 3)).astype(np.int16)
-    ret_y= np.zeros((batchsize, n_classes)).astype(np.int16)
+    ret_x= np.zeros((batchsize, height,width, 3)).astype(np.uint8)
+    ret_y= np.zeros((batchsize, n_classes)).astype(float)
     batchcount = 0
     while True:
-        for i in range(len(shuffled_files)):
-            row = shuffled_files[i]
-            #print(row)
-            ###img = cv2.imread(row, 0)
+        for lab, img in zip(labels, all_imgs):
+            ###img = cv2.imread(img, 0)
             ###img= resize_image (img, height, width)
             ###img = img.astype(np.uint16)
             ###ret_x[batchcount, :,:,0] = img[:,:]
             ###ret_x[batchcount, :,:,1] = img[:,:]
             ###ret_x[batchcount, :,:,2] = img[:,:]
             
-            img = cv2.imread(row)
+            img = cv2.imread(img)
             img= resize_image (img, height, width)
             img = img.astype(np.uint16)
             ret_x[batchcount, :,:,:] = img[:,:,:]
             
             #print(int(shuffled_labels[i]) )
             #print( categories[int(shuffled_labels[i])] )
-            ret_y[batchcount, :] =  categories[ int( shuffled_labels[i] ) ][:]
+            ret_y[batchcount, :] =  categories[int(lab)][:]
             
             batchcount+=1
             
             if batchcount>=batchsize:
-                ret_x = ret_x/255.
+                ret_x = ret_x//255
                 yield ret_x, ret_y
-                ret_x= np.zeros((batchsize, height,width, 3)).astype(np.int16)
-                ret_y= np.zeros((batchsize, n_classes)).astype(np.int16)
+                ret_x[:] = 0
+                ret_y[:] = 0
                 batchcount = 0
 
 def do_brightening(img, factor):