Nested Gradient Tape in function (TF2.0)

Question

I am trying to implement MAML. For that I need a copy of my model (model_copy) that is trained for one step, and then I need to train my meta_model using the loss of model_copy.

I would like to do the training of model_copy inside a function. When I move my code into the function, I no longer get proper gradients_meta (they are all None).

It seems that the graphs are not connected - how can I connect them?

Any idea what I am doing wrong? I watch a lot of variables, but that doesn't seem to make a difference.

Here is the code to reproduce this issue:

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend


def copy_model(model):
    """Return a new two-layer network holding the same weight values as ``model``.

    The replica always uses the fixed Dense(5) -> Dense(1) layout with a
    one-feature input; only the weight values are taken from ``model`` (read
    with ``get_weights`` and written with ``set_weights``).
    """
    replica = keras.Sequential([
        keras.layers.Dense(5, input_shape=(1,)),
        keras.layers.Dense(1),
    ])
    replica.set_weights(model.get_weights())
    return replica


def compute_loss(model, x, y):
    """Run ``model`` on ``x`` and return ``(mse, predictions)``.

    ``mse`` is the mean of the Keras mean-squared-error between the target
    ``y`` and the model output.
    """
    predictions = model(x)  # forward pass of the model
    # error between prediction and label/truth
    squared_error = keras.losses.mean_squared_error(y, predictions)
    return keras_backend.mean(squared_error), predictions


# Meta model whose parameters are meant to be updated via the outer gradient tape.
meta_model = keras.Sequential([
    keras.layers.Dense(5, input_shape=(1,)),
    keras.layers.Dense(1),
])

# Optimizer used for the parameter updates in this script.
optimizer = keras.optimizers.Adam()


# Inner step: build a copy of meta_model and train that copy for one step.
def do_calc(x, y, meta_model):
    """Copy ``meta_model``, apply one optimizer step to the copy, and return it.

    The copy comes from ``copy_model``, which transfers the weights with
    ``get_weights`` / ``set_weights``.

    NOTE(review): that transfer presumably hands over plain values only, so
    the returned copy would carry no recorded dependency on ``meta_model``;
    this is the likely reason the caller's outer-tape gradients are None --
    confirm.
    """
    with tf.GradientTape() as gg:
        model_copy = copy_model(meta_model)
        gg.watch(x)
        gg.watch(meta_model.trainable_variables)
        gg.watch(model_copy.trainable_variables)
        loss, _ = compute_loss(model_copy, x, y)
        # gradient of the copy's loss w.r.t. the copy's own variables
        gradient = gg.gradient(loss, model_copy.trainable_variables)
        # update the copy with the module-level optimizer
        optimizer.apply_gradients(zip(gradient, model_copy.trainable_variables))
        return model_copy


# inputs for training (input and target are the same constant)
x = tf.constant(3.0, shape=(1, 1, 1))
y = tf.constant(3.0, shape=(1, 1, 1))

with tf.GradientTape() as g:

    g.watch(x)
    g.watch(y)

    # one-step-trained copy of meta_model, produced by do_calc
    model_copy = do_calc(x, y, meta_model)
    g.watch(model_copy.trainable_variables)
    # calculate the loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)
    # build gradients of that loss w.r.t. meta_model for the meta update
    gradients_meta = g.gradient(test_loss, meta_model.trainable_variables)
    # problem being reported: gradients_meta is always None here
    optimizer.apply_gradients(zip(gradients_meta, meta_model.trainable_variables))

Thank you in advance for any help.

Answer 1

I found a solution: I needed to "connect" meta-model and model-copy somehow.

Can anybody explain why this works and how I would achieve that using a "proper" optimizer?

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend


def copy_model(model):
    """Build a Dense(5) -> Dense(1) network and load ``model``'s weight values into it.

    The layout is fixed (one input feature); the weight values are read from
    ``model`` with ``get_weights`` and written to the new network with
    ``set_weights``.
    """
    layers = [
        keras.layers.Dense(5, input_shape=(1,)),
        keras.layers.Dense(1),
    ]
    duplicate = keras.Sequential(layers)
    duplicate.set_weights(model.get_weights())
    return duplicate


def compute_loss(model, x, y):
    """Return ``(mse, outputs)`` for ``model`` evaluated on ``x`` against target ``y``.

    ``mse`` is the mean over the Keras mean-squared-error values; ``outputs``
    is the raw model prediction.
    """
    outputs = model(x)  # prediction of the model
    # loss between prediction and label/truth
    per_element_error = keras.losses.mean_squared_error(y, outputs)
    mse = keras_backend.mean(per_element_error)
    return mse, outputs


# Meta model to be learned through the outer gradient tape.
meta_model = keras.Sequential([
    keras.layers.Dense(5, input_shape=(1,)),
    keras.layers.Dense(1),
])

# Optimizer that applies the meta-gradients to meta_model.
optimizer = keras.optimizers.Adam()


# Inner step: derive adapted parameters for a copy of meta_model.
def do_calc(meta_model, x, y, gg, alpha=0.01):
    """Return a copy of ``meta_model`` whose layers hold one-step-adapted parameters.

    ``gg`` is a gradient tape supplied by the caller; it is used to
    differentiate the copy's loss with respect to the copy's trainable
    variables. Each layer's ``kernel`` and ``bias`` on the copy are then
    replaced by ``meta_parameter - alpha * gradient``, computed from the
    corresponding ``meta_model`` layer.
    """
    model_copy = copy_model(meta_model)
    loss, _ = compute_loss(model_copy, x, y)
    gradients = gg.gradient(loss, model_copy.trainable_variables)
    for index, copy_layer in enumerate(model_copy.layers):
        meta_layer = meta_model.layers[index]
        # \theta_i' = \theta - \alpha * gradients
        # entry 2*index is used for the kernel, entry 2*index + 1 for the bias
        kernel_step = tf.multiply(alpha, gradients[2 * index])
        copy_layer.kernel = tf.subtract(meta_layer.kernel, kernel_step)
        bias_step = tf.multiply(alpha, gradients[2 * index + 1])
        copy_layer.bias = tf.subtract(meta_layer.bias, bias_step)
    return model_copy


# Outer tape g wraps the whole meta step; inner tape gg is handed to do_calc.
with tf.GradientTape() as g:
    # inputs for training (input and target are the same constant)
    x = tf.constant(3.0, shape=(1, 1, 1))
    y = tf.constant(3.0, shape=(1, 1, 1))
    adapted_models = []  # not used anywhere in the code shown here

    # model_copy = meta_model
    with tf.GradientTape() as gg:
        # do_calc uses gg to compute the copy's gradients while gg is still open
        model_copy = do_calc(meta_model, x, y, gg)

    # calculate the loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)
    # build gradients of that loss w.r.t. meta_model for the meta update
    gradients_meta = g.gradient(test_loss, meta_model.trainable_variables)
    # gradients work here.
    # NOTE(review): presumably because do_calc sets the copy's kernel/bias to
    # tensors computed from meta_model's kernel/bias while g is recording, so
    # test_loss depends on meta_model's variables -- confirm.
    optimizer.apply_gradients(zip(gradients_meta, meta_model.trainable_variables))

Answer 2

Converting the tensors to NumPy arrays and calling set_weights() only copies the updated parameter values. The corresponding nodes in the TF2 graph are different ones (their names have changed), so it is not possible to use the loss of the copied model directly to compute the gradient for the meta model.

Nested Gradient Tape in function (TF2.0)

Question

2 answers

solution1
3 2019-11-15 10:27:40

solution2
0 2021-12-16 07:07:44

Nested Gradient Tape in function (TF2.0)

Question

2 answers

solution1 3 2019-11-15 10:27:40

solution2 0 2021-12-16 07:07:44

solution1
3 2019-11-15 10:27:40

solution2
0 2021-12-16 07:07:44