
What is the behavior of tf.keras sub-models in training, saving and loading?

Let us say we have two models:

def build_badger():

  badger_input = keras.Input(shape=(100, 100), name='badger_input')
  x = badger_input

  # start misc layers..
  x = keras.layers.Conv2D(..)(x)

  ..

  # end misc layers
  z = keras.layers.Dense(10, name='to_mushroom')(x)
  y = keras.layers.Dense(1, activation='sigmoid', name='badger_prediction')(x)

  return keras.Model(name='badger',
                     inputs={'badger_input': badger_input},
                     outputs={'badger_prediction': y, 'to_mushroom': z})

and

def build_mushroom():

  mushroom_input = keras.Input(shape=(10, 100, 100), name='mushroom_input')
  badger = build_badger()

  badger_result = [badger(mushroom_input[:, n], training=True)['to_mushroom'] for n in range(10)]
  x = keras.layers.Concatenate()(badger_result)

  # start misc layers..
  x = keras.layers.Reshape(..)(x)

  ..

  # end misc layers ..
  y = keras.layers.Dense(1, activation='sigmoid', name='mushroom_prediction')(x)

  return keras.Model(name='mushroom',
                     inputs={'mushroom_input': mushroom_input},
                     outputs={'mushroom_prediction': y})

mushroom = build_mushroom()
badger = mushroom.get_layer('badger')

As you can see, mushroom builds on several applications of badger (via its 'to_mushroom' output). But we would also like to optimize badger independently, so the training looks something like this:

def training_loop(n):

    for _ in range(n):
        # train the sub-model on its own
        badger.compile(..)
        badger.fit(epochs=10, ..)

        # then train the composite model, which contains badger
        mushroom.compile(..)
        mushroom.fit(epochs=1, ..)

        mushroom.save(checkpoint_path)


def load_trained_models():
    mushroom = keras.models.load_model(checkpoint_path)
    badger = mushroom.get_layer('badger')
    return mushroom, badger

In short, we'd like to train for a while, save checkpoints, and resume training at a later stage:

# train models from scratch:
training_loop(10)

# load checkpoint:
mushroom, badger = load_trained_models()

# resume training loaded models:
training_loop(10)
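For the save/load round trip specifically, here is a minimal, self-contained sketch of what I'd expect to work (the toy models and checkpoint path are placeholders of my own, not the real badger/mushroom): a nested functional model is saved, reloaded with keras.models.load_model, and the sub-model is fetched back by name with get_layer.

import tensorflow as tf

# Toy stand-ins for badger/mushroom (hypothetical, for illustration only).
inner_in = tf.keras.Input(shape=(4,), name='inner_in')
inner = tf.keras.Model(inner_in, tf.keras.layers.Dense(2)(inner_in), name='inner')

outer_in = tf.keras.Input(shape=(4,), name='outer_in')
outer = tf.keras.Model(outer_in, tf.keras.layers.Dense(1)(inner(outer_in)), name='outer')

outer.save('outer_ckpt.keras')  # placeholder path; on-disk format depends on TF version
loaded = tf.keras.models.load_model('outer_ckpt.keras')

# The nested model survives the round trip and can be fetched by name,
# then compiled and trained stand-alone again.
inner_loaded = loaded.get_layer('inner')
inner_loaded.compile(optimizer='sgd', loss='mse')

After loading, inner_loaded and loaded should share weights the same way the originals did, so the training_loop above should resume cleanly.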

One thing seems unclear to me: after the initial model creation, is there only one badger model, or does mushroom create its own copy?

The intention is to have only one badger model, and to train it interchangeably as a sub-model of mushroom and as a stand-alone model.

I think this is the case, but it is hard to tell from the tests I've done: the training seems to work, but the data is quite noisy, so I can't rule out that I'm actually training two independent models.
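Rather than judging from noisy loss curves, the sharing can be checked directly. A minimal sketch (toy models of my own, standing in for badger/mushroom): get_layer returns the very same Python object, and fitting the parent moves the sub-model's weights.

import numpy as np
import tensorflow as tf

# Hypothetical toy inner/outer pair, for illustration only.
inner_in = tf.keras.Input(shape=(4,))
inner = tf.keras.Model(inner_in, tf.keras.layers.Dense(4)(inner_in), name='inner')

outer_in = tf.keras.Input(shape=(4,))
outer = tf.keras.Model(outer_in, tf.keras.layers.Dense(1)(inner(outer_in)), name='outer')

# One object, not a copy:
assert outer.get_layer('inner') is inner

# Training the parent moves the sub-model's weights:
w_before = inner.get_weights()[0].copy()
outer.compile(optimizer='sgd', loss='mse')
outer.fit(np.random.rand(32, 4), np.random.rand(32, 1), epochs=1, verbose=0)
print('shared weights updated:', not np.allclose(w_before, inner.get_weights()[0]))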

[Plot: loss comparison]

Here, the training order was: first badger for 10 epochs, then mushroom for 1 epoch, and repeat.

I can confirm that it does work as intended above. The following code shows that if you have a sub-model (a "model in a model", if the term is unclear), then you can train the sub-model separately or as part of the larger model:

import tensorflow as tf
from tensorflow.keras import layers


def build_imsimple():

  def act(z):
      return tf.tanh(z)

  x_train = layers.Input(shape=(28, 28), name='x_train')
  x = layers.LayerNormalization(axis=[1, 2])(x_train)
  x = layers.Reshape((28, 28, 1))(x)

  x = layers.Conv2D(filters=64, kernel_size=(7, 7), padding='valid')(x)
  x = layers.SpatialDropout2D(0.25)(x)
  x = act(x)

  x = layers.Conv2D(filters=64, kernel_size=(3, 3), strides=(2, 2), padding='valid')(x)
  x = layers.SpatialDropout2D(0.5)(x)
  x = act(x)

  x = layers.LocallyConnected2D(filters=64, kernel_size=(3, 3), strides=(2, 2), padding='valid')(x)
  x = layers.SpatialDropout2D(0.5)(x)
  x = act(x)

  x = layers.Flatten()(x)
  x = layers.Dense(256)(x)
  x = layers.Dropout(0.5)(x)
  x = act(x)

  y_train = layers.Dense(10, name='y_train')(x)

  return tf.keras.Model(inputs=[x_train], outputs=[y_train], name='imsimple')


def build_composite_dummy():

  x_train = layers.Input(shape=(28, 28), name='x_dummy_train')

  imsimple = build_imsimple()
  x = imsimple(x_train)
  y = layers.Dense(10,
                   name='y_dummy_train',
                   kernel_initializer=tf.keras.initializers.Identity(),
                   use_bias=False)(x)

  return tf.keras.Model(inputs=[x_train], outputs=[y], name='Dummy')


def test_dummy():

  (mnist_x_train, mnist_y_train), (mnist_x_test, mnist_y_test) = tf.keras.datasets.mnist.load_data()

  dummy = build_composite_dummy()

  dummy_pred = dummy(mnist_x_test)
  dummy_acc = tf.reduce_mean(tf.keras.metrics.SparseCategoricalAccuracy()(y_true=mnist_y_test, y_pred=dummy_pred))
  print(f"dummy pre train acc: {float(dummy_acc)}")

  imsimple = dummy.get_layer('imsimple')

  imsimple.compile(optimizer=tf.keras.optimizers.SGD(),
                   loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                   metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

  imsimple.fit(x=mnist_x_train,
               y=mnist_y_train,
               batch_size=64,
               steps_per_epoch=200,
               epochs=10,
               validation_data=(mnist_x_test, mnist_y_test))

  dummy_pred = dummy(mnist_x_test)
  dummy_acc = tf.reduce_mean(tf.keras.metrics.SparseCategoricalAccuracy()(y_true=mnist_y_test, y_pred=dummy_pred))
  print(f"dummy post imsimple train acc: {float(dummy_acc)}")

  dummy_noise = tf.random.normal(shape=(mnist_y_train.shape[0], 10), mean=0.0, stddev=1.0, dtype='float32')
  dummy_test_noise = tf.random.normal(shape=(mnist_y_test.shape[0], 10), mean=0.0, stddev=1.0, dtype='float32')

  dummy.compile(optimizer=tf.keras.optimizers.SGD(),
                loss=tf.keras.losses.MeanSquaredError())

  # validate against noise targets too, so the MSE loss sees matching shapes
  dummy.fit(x=mnist_x_train,
            y=dummy_noise,
            batch_size=64,
            steps_per_epoch=200,
            epochs=10,
            validation_data=(mnist_x_test, dummy_test_noise))

  imsimple_pred = imsimple(mnist_x_test)
  imsimple_acc = tf.reduce_mean(tf.keras.metrics.SparseCategoricalAccuracy()(y_true=mnist_y_test, y_pred=imsimple_pred))
  print(f"imsimple post dummy train acc: {float(imsimple_acc)}")

Basically, the sub-model here is trained to classify the MNIST dataset, while the parent model is then trained to predict noise. Trained in this order, the sub-model reaches about 90% accuracy, but drops to about 15% after the parent is fit on noise. If the two were independent copies, the parent's training could not have destroyed the sub-model's accuracy, so they share a single set of weights.
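As a side note beyond the original test: if you ever want the parent's training not to disturb the shared sub-model, Keras lets you freeze it before compiling the parent. A minimal sketch reusing the names above:

# Freeze the shared sub-model, then recompile so the flag takes effect;
# dummy.fit(...) will then update only the outer Dense layer.
imsimple.trainable = False
dummy.compile(optimizer=tf.keras.optimizers.SGD(),
              loss=tf.keras.losses.MeanSquaredError())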
