mirror of
https://github.com/qurator-spk/eynollah.git
synced 2026-03-01 21:02:00 +01:00
training: plot predictions to TB logs along with training/testing
This commit is contained in:
parent
56833b3f55
commit
18607e0f48
1 changed files with 76 additions and 2 deletions
|
|
@ -74,6 +74,79 @@ def configuration():
|
||||||
except:
|
except:
|
||||||
print("no GPU device available", file=sys.stderr)
|
print("no GPU device available", file=sys.stderr)
|
||||||
|
|
||||||
|
@tf.function
def plot_layout_tf(in_: tf.Tensor, out: tf.Tensor) -> tf.Tensor:
    """
    Implements training.inference.SBBPredict.visualize_model_output for TF
    (effectively plotting the layout segmentation map on the input image).

    In doing so, also converts:
    - from Eynollah's BGR/float on the input side
    - to std RGB/int format on the output side
    """
    # Fixed per-class RGB palette (row index = predicted class id).
    palette = tf.constant(
        [[255, 255, 255], [255, 0, 0], [255, 125, 0], [255, 0, 125],
         [125, 125, 125], [125, 125, 0], [0, 125, 255], [0, 125, 0],
         [125, 125, 125], [0, 125, 255], [125, 0, 125], [0, 255, 0],
         [0, 0, 255], [0, 255, 255], [255, 125, 125], [255, 0, 255]])
    # in_: [B, H, W, 3] BGR float — reverse channels to RGB and scale by 255
    # (assumes float input normalized to [0, 1] — TODO confirm with caller)
    rgb = tf.cast(in_[..., ::-1] * 255, tf.float32)
    # out: [B, H, W, C] class scores -> per-pixel class ids [B, H, W]
    class_ids = tf.math.argmax(out, axis=-1)
    # colorize the label map: [B, H, W] -> [B, H, W, 3]
    overlay = tf.cast(tf.gather(palette, class_ids), tf.float32)
    # blend the palette overlay onto the image
    # (0.5 image weight was too dark, hence 0.9/0.1)
    blended = rgb * 0.9 + overlay * 0.1
    return tf.cast(blended, tf.uint8)
|
|
||||||
|
# plot predictions on train and test set during every epoch
class TensorBoardPlotter(TensorBoard):
    """TensorBoard callback that also logs model predictions as images.

    At the start of each epoch the model's ``call`` is temporarily wrapped
    so that every forward pass additionally renders the predicted layout
    (via ``plot_layout_tf``) and writes it to the TensorBoard image log;
    the wrap is removed again at epoch end.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # original (unwrapped) model.call, saved so it can be restored
        self.model_call = None

    def on_epoch_begin(self, epoch, logs=None):
        """Wrap ``model.call`` so each forward pass also plots its output."""
        super().on_epoch_begin(epoch, logs=logs)
        self.model_call = self.model.call
        @tf.function
        def new_call(inputs, **kwargs):
            # run the real forward pass, then log a visualization of it
            outputs = self.model_call(inputs, **kwargs)
            images = plot_layout_tf(inputs, outputs)
            # 'training' distinguishes the train writer from the val writer
            self.plot(images, training=kwargs.get('training', None), epoch=epoch)
            return outputs
        self.model.call = new_call

    def on_epoch_end(self, epoch, logs=None):
        """Restore the original ``model.call`` and force function rebuilds."""
        # re-instate (so ModelCheckpoint does not see our override call)
        self.model.call = self.model_call
        # force rebuild of tf.function (so Python binding for epoch gets re-evaluated)
        self.model.train_function = self.model.make_train_function(True)
        self.model.test_function = self.model.make_test_function(True)
        super().on_epoch_end(epoch, logs=logs)

    def plot(self, images, training=None, epoch=0):
        """Write ``images`` to the train or validation TensorBoard writer.

        NOTE(review): relies on private TensorBoard-callback attributes
        (``_train_writer``/``_val_writer``/``_train_step``/``_val_step``) —
        these may change between Keras versions; confirm against the
        installed version.
        """
        if training:
            writer = self._train_writer
            mode, step = "train", self._train_step.read_value()
        else:
            writer = self._val_writer
            mode, step = "test", self._val_step.read_value()
        # group all images of one epoch under a common name-scope prefix
        family = "epoch_%03d" % (1 + epoch)
        with writer.as_default():
            # used to be family kwarg for tf.summary.image name prefix
            with tf.name_scope(family):
                tf.summary.image(mode, images, step=step, max_outputs=len(images))
|
|
||||||
def get_dirs_or_files(input_data):
|
def get_dirs_or_files(input_data):
|
||||||
image_input, labels_input = os.path.join(input_data, 'images/'), os.path.join(input_data, 'labels/')
|
image_input, labels_input = os.path.join(input_data, 'images/'), os.path.join(input_data, 'labels/')
|
||||||
|
|
@ -471,14 +544,15 @@ def run(_config,
|
||||||
lab_gen = lab_gen.map(_to_categorical)
|
lab_gen = lab_gen.map(_to_categorical)
|
||||||
return tf.data.Dataset.zip(img_gen, lab_gen).rebatch(n_batch, drop_remainder=True)
|
return tf.data.Dataset.zip(img_gen, lab_gen).rebatch(n_batch, drop_remainder=True)
|
||||||
train_gen = get_dataset(dir_flow_train_imgs, dir_flow_train_labels, shuffle=np.random.randint(1e6))
|
train_gen = get_dataset(dir_flow_train_imgs, dir_flow_train_labels, shuffle=np.random.randint(1e6))
|
||||||
callbacks = [TensorBoard(os.path.join(dir_output, 'logs'), write_graph=False),
|
|
||||||
SaveWeightsAfterSteps(0, dir_output, _config)]
|
|
||||||
valdn_gen = get_dataset(dir_flow_eval_imgs, dir_flow_eval_labels)
|
valdn_gen = get_dataset(dir_flow_eval_imgs, dir_flow_eval_labels)
|
||||||
train_steps = len(os.listdir(dir_flow_train_imgs)) // n_batch
|
train_steps = len(os.listdir(dir_flow_train_imgs)) // n_batch
|
||||||
valdn_steps = len(os.listdir(dir_flow_eval_imgs)) // n_batch
|
valdn_steps = len(os.listdir(dir_flow_eval_imgs)) // n_batch
|
||||||
_log.info("training on %d batches in %d epochs", train_steps, n_epochs)
|
_log.info("training on %d batches in %d epochs", train_steps, n_epochs)
|
||||||
_log.info("validating on %d batches", valdn_steps)
|
_log.info("validating on %d batches", valdn_steps)
|
||||||
|
|
||||||
|
callbacks = [TensorBoardPlotter(os.path.join(dir_output, 'logs'), write_graph=False),
|
||||||
|
SaveWeightsAfterSteps(0, dir_output, _config),
|
||||||
|
]
|
||||||
if save_interval:
|
if save_interval:
|
||||||
callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config))
|
callbacks.append(SaveWeightsAfterSteps(save_interval, dir_output, _config))
|
||||||
model.fit(
|
model.fit(
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue