使用 tensorflow-gpu 1.14 和 tf.distribute.MirroredStrategy() 的自定義訓練循環導致 ValueError

[英]Custom training loop using tensorflow-gpu 1.14 and tf.distribute.MirroredStrategy() results in ValueError

我正在嘗試使用tf.distribute.MirroredStrategy()在多個 GPU 上運行自定義訓練循環。 雖然訓練循環在單個 GPU 上完美運行,但ValueError: 'handle' is not available outside the replica context or a 'tf.distribute.Strategy.update()' call當我嘗試使用多個 GPU 時拋出ValueError: 'handle' is not available outside the replica context or a 'tf.distribute.Strategy.update()' call 我正在使用 tensorflow 1.14 和 Python 3.7.3。

我在下面嘗試了一個最小的例子。 自定義訓練循環在單個 GPU 上運行沒有問題,但我嘗試將tf.distribute.MirroredStrategy()用於多個 GPU 失敗並顯示錯誤消息(完整輸出)

ValueError                                Traceback (most recent call last)
<ipython-input-11-3fda5d330457> in <module>
      1 with mirrored_strategy.scope():
----> 2     model, train_op, X1_in, X2_in = create_model_and_train_op()
      3     with tf.Session() as sess:
      4         sess.run(tf.global_variables_initializer())
      5         for sample_ind in range(n_samples):

<ipython-input-7-8f5b3971bbe2> in create_model_and_train_op()
      7     model = Model(name='BNN',inputs=[X1_in,X2_in], outputs=[loss])
----> 8     train_op = tf.train.AdamOptimizer().minimize(loss)
     10     return model, train_op, X1_in, X2_in

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/training/optimizer.py in minimize(self, loss, global_step, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, name, grad_loss)
    401         aggregation_method=aggregation_method,
    402         colocate_gradients_with_ops=colocate_gradients_with_ops,
--> 403         grad_loss=grad_loss)
    405     vars_with_grad = [v for g, v in grads_and_vars if g is not None]

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/training/optimizer.py in compute_gradients(self, loss, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, grad_loss)
    510         gate_gradients=(gate_gradients == Optimizer.GATE_OP),
    511         aggregation_method=aggregation_method,
--> 512         colocate_gradients_with_ops=colocate_gradients_with_ops)
    513     if gate_gradients == Optimizer.GATE_GRAPH:
    514       grads = control_flow_ops.tuple(grads)

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_impl.py in gradients(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients)
    156         ys, xs, grad_ys, name, colocate_gradients_with_ops,
    157         gate_gradients, aggregation_method, stop_gradients,
--> 158         unconnected_gradients)
    159   # pylint: enable=protected-access

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
    595     xs = [
    596         x.handle if resource_variable_ops.is_resource_variable(x) else x
--> 597         for x in xs
    598     ]
    599     xs = ops.internal_convert_n_to_tensor_or_indexed_slices(

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/ops/gradients_util.py in <listcomp>(.0)
    595     xs = [
    596         x.handle if resource_variable_ops.is_resource_variable(x) else x
--> 597         for x in xs
    598     ]
    599     xs = ops.internal_convert_n_to_tensor_or_indexed_slices(

~/.local/share/virtualenvs/keras_bnn_lv-NP1oBJBi/lib/python3.7/site-packages/tensorflow/python/distribute/values.py in handle(self)
    641       device = distribute_lib.get_update_device()
    642       if device is None:
--> 643         raise ValueError("`handle` is not available outside the replica context"
    644                          " or a `tf.distribute.Strategy.update()` call.")
    645     return self.get(device=device).handle

ValueError: `handle` is not available outside the replica context or a `tf.distribute.Strategy.update()` call.

Google 建議的唯一修復是更新到 tensorflow 2.0.0 - 測試版。 我想知道是否有辦法在 1.14 中解決這個問題。


import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model

import sys
print (sys.version)

input_dim = 42
n_samples = 10000

x1_data = np.random.rand(n_samples,input_dim)
x2_data = np.random.rand(n_samples,input_dim)

def create_model_and_train_op():
    X1_in = Input(shape=(input_dim,))
    X2_in = Input(shape=(input_dim,))
    XY = Concatenate(axis=-1)([X1_in,X2_in])
    loss = Dense(1)(XY)
    model = Model(name='BNN',inputs=[X1_in,X2_in], outputs=[loss])

    # Error message is thrown in the following line if using MirroredStrategy()
    train_op = tf.train.AdamOptimizer().minimize(loss)

    return model, train_op, X1_in, X2_in

##### Single GPU: Runs without problems
model, train_op, X1_in, X2_in = create_model_and_train_op()
with tf.Session() as sess:
    for sample_ind in range(n_samples):
        sess.run(train_op, feed_dict = {X1_in : x1_data[sample_ind].reshape(1,input_dim) , X2_in : x2_data[sample_ind].reshape(1,input_dim) })

##### Multiple GPU: Results in error message
mirrored_strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(mirrored_strategy.num_replicas_in_sync))

with mirrored_strategy.scope():
    model, train_op, X1_in, X2_in = create_model_and_train_op()
    with tf.Session() as sess:
        for sample_ind in range(n_samples):
            sess.run(train_op, feed_dict = {X1_in : x1_data[sample_ind].reshape(1,input_dim) , X2_in : x2_data[sample_ind].reshape(1,input_dim) })

我僅使用權重負載解決了這個問題。 這是 Keras手冊中關於多 GPU 訓練的修改示例。

import tensorflow as tf
from tensorflow import keras
import os 
from tensorflow.python.keras.backend import set_session

def get_compiled_model():
    # Make a simple 2-layer densely-connected neural network.

    global sess
    global graph
    sess = tf.Session()
    graph = tf.get_default_graph()
    # a special trick from here https://github.com/tensorflow/tensorflow/issues/28287

    # IMPORTANT: models have to be loaded AFTER SETTING THE SESSION for keras! 
    # Otherwise, their weights will be unavailable in the threads after the session there has been set
    inputs = keras.Input(shape=(784,))
    x = keras.layers.Dense(256, activation="relu")(inputs)
    x = keras.layers.Dense(256, activation="relu")(x)
    outputs = keras.layers.Dense(10)(x)
    model = keras.Model(inputs, outputs)
    checkpoints = [checkpoint_dir + "/" + name for name in os.listdir(checkpoint_dir)]
    if checkpoints:
        latest_checkpoint = checkpoints[-1]
        latest_checkpoint = checkpoint_dir + "/" + os.path.splitext(os.path.split(checkpoints[-1])[-1])[0]
        print("Restoring from", latest_checkpoint)
    return model

def get_dataset():
    batch_size = 32
    num_val_samples = 10000

    # Return the MNIST dataset in the form of a `tf.data.Dataset`.
    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

    # Preprocess the data (these are Numpy arrays)
    x_train = x_train.reshape(-1, 784).astype("float32") / 255
    x_test = x_test.reshape(-1, 784).astype("float32") / 255
    y_train = y_train.astype("float32")
    y_test = y_test.astype("float32")

    # Reserve num_val_samples samples for validation
    x_val = x_train[-num_val_samples:]
    y_val = y_train[-num_val_samples:]
    x_train = x_train[:-num_val_samples]
    y_train = y_train[:-num_val_samples]
    return (
        tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size),
        tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(batch_size),
        tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size),

# Train the model on all available devices.
train_dataset, val_dataset, test_dataset = get_dataset()

# Prepare a directory to store all the checkpoints.
checkpoint_dir = "./ckpt"
if not os.path.exists(checkpoint_dir):

def make_or_restore_model():
    # Either restore the latest model, or create a fresh one
    # if there is no checkpoint available.
    print("Creating a new model")
    return get_compiled_model()

def run_training(epochs=1):
    # Create a MirroredStrategy.
    strategy = tf.distribute.MirroredStrategy()

    print("Number of devices: {}".format(strategy.num_replicas_in_sync))
    # Open a strategy scope and create/restore the model
    with strategy.scope():
        model = make_or_restore_model()

    callbacks = [
        # This callback saves a SavedModel every epoch
        # We include the current epoch in the folder name.
            filepath=checkpoint_dir + "/ckpt-{epoch}.cpkt", save_freq="epoch"
    with graph.as_default():
    return model

# Running the first time creates the model
model = run_training(epochs=2)

# Test the model on all available devices.



