RuntimeError：方法需要在跨副本上下文中，使用 get_replica_context().merge_call()

Question

我嘗試將正在運行的 CycleGAN 從 SingleGPU 修改為 tf.distribute.MirroredStrategy。 嘗試了幾種方法，例如自定義訓練循環、 jongsung park的問題、Tensorflow 教程之后的調整以及 strategy.scope() 的幾個地方。 但是我仍然收到以下錯誤。

Ausnahme: RuntimeError
in user code:

    File "C:\Users\Einka\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "w:\300_Neural_Network\320_Unsupervised_GAN_CycleConsistency\CycleGAN_Custom_Trainingloop", line 490, in train_step  *
        G_loss, F_loss, F_X_loss, D_Y_loss = strategy.run(self.train_step_single, args=(self, batch_data))

    RuntimeError: Method requires being in cross-replica context, use get_replica_context().merge_call()
  File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_file9_hoxhkq.py", line 10, in tf__train_step
    (G_loss, F_loss, F_X_loss, D_Y_loss) = ag__.converted_call(ag__.ld(strategy).run, (ag__.ld(self).train_step_single,), dict(args=(ag__.ld(self), ag__.ld(batch_data))), fscope)

During handling of the above exception, another exception occurred:

  File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_file9_hoxhkq.py", line 10, in tf__train_step
    (G_loss, F_loss, F_X_loss, D_Y_loss) = ag__.converted_call(ag__.ld(strategy).run, (ag__.ld(self).train_step_single,), dict(args=(ag__.ld(self), ag__.ld(batch_data))), fscope)

During handling of the above exception, another exception occurred:

  File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_filejk9kpr6g.py", line 15, in tf__train_function
    retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
  File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_file9_hoxhkq.py", line 10, in tf__train_step
    (G_loss, F_loss, F_X_loss, D_Y_loss) = ag__.converted_call(ag__.ld(strategy).run, (ag__.ld(self).train_step_single,), dict(args=(ag__.ld(self), ag__.ld(batch_data))), fscope)

During handling of the above exception, another exception occurred:

  File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_file9_hoxhkq.py", line 10, in tf__train_step
    (G_loss, F_loss, F_X_loss, D_Y_loss) = ag__.converted_call(ag__.ld(strategy).run, (ag__.ld(self).train_step_single,), dict(args=(ag__.ld(self), ag__.ld(batch_data))), fscope)
  File "C:\Users\Einka\AppData\Local\Temp\__autograph_generated_filejk9kpr6g.py", line 15, in tf__train_function
    retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
  File "W:\300_Neural_Network\320_Unsupervised_GAN_CycleConsistency\CycleGAN_Custom_Trainingloop", line 575, in <module>
    cycle_gan_model.fit(

原來的 model 是來自 keras.io 的CycleGan 。 下面發布的代碼也可以在Colab上找到


import random
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_addons as tfa
import tensorflow_datasets as tfds

#CONFIGURATION
EPOCHS = 1
BATCH_SIZE_PER_REPLICA = 1
BUFFER_SIZE = 256

#Tensorboard
logdir = "W:/300_Neural_Network/320_Unsupervised_GAN_CycleConsistency/logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
file_writer = tf.summary.create_file_writer(logdir + "/metrics")
file_writer.set_as_default()
# ----

# Distribution strategy
strategy = tf.distribute.MirroredStrategy()
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)

# Visual parameters
tfds.disable_progress_bar()
# autotune = tf.data.AUTOTUNE

# Load the horse-zebra dataset using tensorflow-datasets.
dataset, _ = tfds.load("cycle_gan/horse2zebra", with_info=True, as_supervised=True)
train_horses, train_zebras = dataset["trainA"], dataset["trainB"]
test_horses, test_zebras = dataset["testA"], dataset["testB"]

# Define the standard image size.
orig_img_size = (286, 286)
# Size of the random crops to be used during training.
input_img_size = (256, 256, 3)
# Weights initializer for the layers.
kernel_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.02, seed=random.seed(random.random()))
# Gamma initializer for instance normalization.
gamma_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.02, seed=random.seed(random.random()))

batch_size = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync


def normalize_img(img):
    img = tf.cast(img, dtype=tf.float32)
    # Map values in the range [-1, 1]
    return (img / 127.5) - 1.0


def preprocess_train_image(img, label):
    # Random flip
    img = tf.image.random_flip_left_right(img)
    # Resize to the original size first
    img = tf.image.resize(img, [*orig_img_size])
    # Random crop to 256X256
    img = tf.image.random_crop(img, size=[*input_img_size])
    # Normalize the pixel values in the range [-1, 1]
    img = normalize_img(img)
    return img


def preprocess_test_image(img, label):
    # Only resizing and normalization for the test images.
    img = tf.image.resize(img, [input_img_size[0], input_img_size[1]])
    img = normalize_img(img)
    return img

def distribute_datasets(strategy, train_batches, test_batches):
    
    ### START CODE HERE ###
    train_dist_dataset = strategy.experimental_distribute_dataset(train_batches)
    test_dist_dataset = strategy.experimental_distribute_dataset(test_batches)
    ### END CODE HERE ###
    
    return train_dist_dataset,  test_dist_dataset

# Apply the preprocessing operations to the training data
train_horses = (
    train_horses.map(preprocess_train_image)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(batch_size)
    .prefetch(1)
)

train_zebras = (
    train_zebras.map(preprocess_train_image)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(batch_size)
    .prefetch(1)
)

# Apply the preprocessing operations to the test data
test_horses = (
    test_horses.map(preprocess_test_image)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(batch_size)
)
test_zebras = (
    test_zebras.map(preprocess_test_image)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(batch_size)
)

train_set, test_set = distribute_datasets(strategy, tf.data.Dataset.zip((train_horses, train_zebras)), tf.data.Dataset.zip((test_horses, test_zebras)))

class ReflectionPadding2D(layers.Layer):
    """Implements Reflection Padding as a layer.

    Args:
        padding(tuple): Amount of padding for the
        spatial dimensions.

    Returns:
        A padded tensor with the same type as the input tensor.
    """

    def __init__(self, padding=(1, 1), **kwargs):
        self.padding = tuple(padding)
        super(ReflectionPadding2D, self).__init__(**kwargs)

    def call(self, input_tensor, mask=None):
        padding_width, padding_height = self.padding
        padding_tensor = [
            [0, 0],
            [padding_height, padding_height],
            [padding_width, padding_width],
            [0, 0],
        ]
        return tf.pad(input_tensor, padding_tensor, mode="REFLECT")


def residual_block(
    x,
    activation,
    kernel_initializer=kernel_init,
    kernel_size=(3, 3),
    strides=(1, 1),
    padding="valid",
    gamma_initializer=gamma_init,
    use_bias=False,
):
    dim = x.shape[-1]
    input_tensor = x

    x = ReflectionPadding2D()(input_tensor)
    x = layers.Conv2D(
        dim,
        kernel_size,
        strides=strides,
        kernel_initializer=kernel_initializer,
        padding=padding,
        use_bias=use_bias,
    )(x)
    x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
    x = activation(x)

    x = ReflectionPadding2D()(x)
    x = layers.Conv2D(
        dim,
        kernel_size,
        strides=strides,
        kernel_initializer=kernel_initializer,
        padding=padding,
        use_bias=use_bias,
    )(x)
    x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
    x = layers.add([input_tensor, x])
    return x


def downsample(
    x,
    filters,
    activation,
    kernel_initializer=kernel_init,
    kernel_size=(3, 3),
    strides=(2, 2),
    padding="same",
    gamma_initializer=gamma_init,
    use_bias=False,
):
    x = layers.Conv2D(
        filters,
        kernel_size,
        strides=strides,
        kernel_initializer=kernel_initializer,
        padding=padding,
        use_bias=use_bias,
    )(x)
    x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
    if activation:
        x = activation(x)
    return x


def upsample(
    x,
    filters,
    activation,
    kernel_size=(3, 3),
    strides=(2, 2),
    padding="same",
    kernel_initializer=kernel_init,
    gamma_initializer=gamma_init,
    use_bias=False,
):
    x = layers.Conv2DTranspose(
        filters,
        kernel_size,
        strides=strides,
        padding=padding,
        kernel_initializer=kernel_initializer,
        use_bias=use_bias,
    )(x)
    x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
    if activation:
        x = activation(x)
    return x

def get_resnet_generator(
    filters=64,
    num_downsampling_blocks=2,
    num_residual_blocks=9,
    num_upsample_blocks=2,
    gamma_initializer=gamma_init,
    name=None,
):
    img_input = layers.Input(shape=input_img_size, name=name + "_img_input")
    x = ReflectionPadding2D(padding=(3, 3))(img_input)
    x = layers.Conv2D(filters, (7, 7), kernel_initializer=kernel_init, use_bias=False)(
        x
    )
    x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
    x = layers.Activation("relu")(x)

    # Downsampling
    for _ in range(num_downsampling_blocks):
        filters *= 2
        x = downsample(x, filters=filters, activation=layers.Activation("relu"))

    # Residual blocks
    for _ in range(num_residual_blocks):
        x = residual_block(x, activation=layers.Activation("relu"))

    # Upsampling
    for _ in range(num_upsample_blocks):
        filters //= 2
        x = upsample(x, filters, activation=layers.Activation("relu"))

    # Final block
    x = ReflectionPadding2D(padding=(3, 3))(x)
    x = layers.Conv2D(3, (7, 7), padding="valid")(x)
    x = layers.Activation("tanh")(x)

    model = keras.models.Model(img_input, x, name=name)
    return model


def get_discriminator(
    filters=64, kernel_initializer=kernel_init, num_downsampling=3, name=None
):
    img_input = layers.Input(shape=input_img_size, name=name + "_img_input")
    x = layers.Conv2D(
        filters,
        (4, 4),
        strides=(2, 2),
        padding="same",
        kernel_initializer=kernel_initializer,
    )(img_input)
    x = layers.LeakyReLU(0.2)(x)

    num_filters = filters
    for num_downsample_block in range(3):
        num_filters *= 2
        if num_downsample_block < 2:
            x = downsample(
                x,
                filters=num_filters,
                activation=layers.LeakyReLU(0.2),
                kernel_size=(4, 4),
                strides=(2, 2),
            )
        else:
            x = downsample(
                x,
                filters=num_filters,
                activation=layers.LeakyReLU(0.2),
                kernel_size=(4, 4),
                strides=(1, 1),
            )

    x = layers.Conv2D(
        1, (4, 4), strides=(1, 1), padding="same", kernel_initializer=kernel_initializer
    )(x)

    model = keras.models.Model(inputs=img_input, outputs=x, name=name)
    return model


# Get the generators
gen_G = get_resnet_generator(name="generator_G")
gen_F = get_resnet_generator(name="generator_F")

# Get the discriminators
disc_X = get_discriminator(name="discriminator_X")
disc_Y = get_discriminator(name="discriminator_Y")

class CycleGan(keras.Model):
    def __init__(
        self,
        generator_G,
        generator_F,
        discriminator_X,
        discriminator_Y,
        lambda_cycle=10.0,
        lambda_identity=0.5,
    ):
        super(CycleGan, self).__init__()
        self.gen_G = generator_G
        self.gen_F = generator_F
        self.disc_X = discriminator_X
        self.disc_Y = discriminator_Y
        self.lambda_cycle = lambda_cycle
        self.lambda_identity = lambda_identity

    def compile(
        self,
        gen_G_optimizer,
        gen_F_optimizer,
        disc_X_optimizer,
        disc_Y_optimizer,
        gen_loss_fn,
        disc_loss_fn,
        cycle_loss_fn,
        identity_loss_fn,
    ):
        super(CycleGan, self).compile()
        self.gen_G_optimizer = gen_G_optimizer
        self.gen_F_optimizer = gen_F_optimizer
        self.disc_X_optimizer = disc_X_optimizer
        self.disc_Y_optimizer = disc_Y_optimizer
        self.generator_loss_fn = gen_loss_fn
        self.discriminator_loss_fn = disc_loss_fn
        self.cycle_loss_fn = cycle_loss_fn 
        self.identity_loss_fn = identity_loss_fn

    def __call__ ( self, batch_data ):
        real_x, real_y = batch_data
        genG = self.gen_G
        genF = self.gen_F 

        return genG(real_x), genF(real_y)

    def call ( self, batch_data ):
        real_x, real_y = batch_data
        genG = self.gen_G
        genF = self.gen_F 

        return genG(real_x), genF(real_y)    
    def compute_output_shape(input_shape=(None, 256, 256, 3)):
        return input_shape

    def train_step_single(self, batch_data):
        # with strategy.scope():
        # x is Horse and y is zebra
        real_x, real_y = batch_data

        with tf.GradientTape(persistent=True) as tape:
            # Horse to fake zebra
            fake_y = self.gen_G(real_x, training=True)
            # Zebra to fake horse -> y2x
            fake_x = self.gen_F(real_y, training=True)

            # Cycle (Horse to fake zebra to fake horse): x -> y -> x
            cycled_x = self.gen_F(fake_y, training=True)
            # Cycle (Zebra to fake horse to fake zebra) y -> x -> y
            cycled_y = self.gen_G(fake_x, training=True)

            # Identity mapping
            same_x = self.gen_F(real_x, training=True)
            same_y = self.gen_G(real_y, training=True)

            # Discriminator output
            disc_real_x = self.disc_X(real_x, training=True)
            disc_fake_x = self.disc_X(fake_x, training=True)

            disc_real_y = self.disc_Y(real_y, training=True)
            disc_fake_y = self.disc_Y(fake_y, training=True)

            # Generator adverserial loss
            gen_G_loss = self.generator_loss_fn(disc_fake_y)
            gen_F_loss = self.generator_loss_fn(disc_fake_x)

            # Generator cycle loss
            cycle_loss_G = self.cycle_loss_fn(real_y, cycled_y) * self.lambda_cycle
            cycle_loss_F = self.cycle_loss_fn(real_x, cycled_x) * self.lambda_cycle

            # Generator identity loss
            id_loss_G = (
                self.identity_loss_fn(real_y, same_y)
                * self.lambda_cycle
                * self.lambda_identity
            )
            id_loss_F = (
                self.identity_loss_fn(real_x, same_x)
                * self.lambda_cycle
                * self.lambda_identity
            )

            # Total generator loss
            total_loss_G = gen_G_loss + cycle_loss_G + id_loss_G
            total_loss_F = gen_F_loss + cycle_loss_F + id_loss_F

            # Discriminator loss
            disc_X_loss = self.discriminator_loss_fn(disc_real_x, disc_fake_x)
            disc_Y_loss = self.discriminator_loss_fn(disc_real_y, disc_fake_y)

        # Get the gradients for the generators
        grads_G = tape.gradient(total_loss_G, self.gen_G.trainable_variables)
        grads_F = tape.gradient(total_loss_F, self.gen_F.trainable_variables)

        # Get the gradients for the discriminators
        disc_X_grads = tape.gradient(disc_X_loss, self.disc_X.trainable_variables)
        disc_Y_grads = tape.gradient(disc_Y_loss, self.disc_Y.trainable_variables)

        # Update the weights of the generators
        self.gen_G_optimizer.apply_gradients(
            zip(grads_G, self.gen_G.trainable_variables)
        )
        self.gen_F_optimizer.apply_gradients(
            zip(grads_F, self.gen_F.trainable_variables)
        )

        # Update the weights of the discriminators
        self.disc_X_optimizer.apply_gradients(
            zip(disc_X_grads, self.disc_X.trainable_variables)
        )
        self.disc_Y_optimizer.apply_gradients(
            zip(disc_Y_grads, self.disc_Y.trainable_variables)
        )

        return {
            "G_loss": total_loss_G,
            "F_loss": total_loss_F,
            "D_X_loss": disc_X_loss,
            "D_Y_loss": disc_Y_loss,
        }

    @tf.function
    def train_step(self, batch_data):
        G_loss, F_loss, F_X_loss, D_Y_loss = strategy.run(self.train_step_single, args=(self, batch_data))
        return G_loss, F_loss, F_X_loss, D_Y_loss 

assert tf.distribute.get_replica_context() is not None  # default

class GANMonitor(keras.callbacks.Callback):
    """A callback to generate and save images after each epoch"""

    def __init__(self, num_img=4):
        self.num_img = num_img

    def on_epoch_end(self, epoch, logs=None):
        _, ax = plt.subplots(4, 2, figsize=(12, 12))
        for i, img in enumerate(test_horses.take(self.num_img)):
            prediction = self.model.gen_G(img)[0].numpy()
            prediction = (prediction * 127.5 + 127.5).astype(np.uint8)
            img = (img[0] * 127.5 + 127.5).numpy().astype(np.uint8)

            ax[i, 0].imshow(img)
            ax[i, 1].imshow(prediction)
            ax[i, 0].set_title("Input image")
            ax[i, 1].set_title("Translated image")
            ax[i, 0].axis("off")
            ax[i, 1].axis("off")

            prediction = keras.preprocessing.image.array_to_img(prediction)
            prediction.save(
                "W:/300_Neural_Network/320_Unsupervised_GAN_CycleConsistency/plots/generated_img_{i}_{epoch}.png".format(i=i, epoch=epoch + 1)
            )
        plt.show()
        plt.close()


# Loss function for evaluating adversarial loss
with strategy.scope():
    adv_loss_fn = keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.SUM)

    # Define the loss function for the generators
    def generator_loss_fn(fake):
        fake_loss = adv_loss_fn(tf.ones_like(fake), fake)
        return fake_loss


    # Define the loss function for the discriminators
    def discriminator_loss_fn(real, fake):
        real_loss = adv_loss_fn(tf.ones_like(real), real)
        fake_loss = adv_loss_fn(tf.zeros_like(fake), fake)
        return (real_loss + fake_loss) * 0.5


# Create cycle gan model
with strategy.scope():
    cycle_gan_model = CycleGan(
        generator_G=gen_G, generator_F=gen_F, discriminator_X=disc_X, discriminator_Y=disc_Y
    )

# Compile the model

with strategy.scope():
    cycle_gan_model.compile(
        gen_G_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
        gen_F_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
        disc_X_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
        disc_Y_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
        gen_loss_fn=generator_loss_fn,
        disc_loss_fn=discriminator_loss_fn,
        cycle_loss_fn=keras.losses.MeanAbsoluteError(reduction=tf.keras.losses.Reduction.SUM),
        identity_loss_fn=keras.losses.MeanAbsoluteError(reduction=tf.keras.losses.Reduction.SUM),
    )

# Callbacks
plotter = GANMonitor()
checkpoint_filepath = "W:/300_Neural_Network/320_Unsupervised_GAN_CycleConsistency/checkpoints/cyclegan_checkpoints.{epoch:03d}"
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True
)

# Here we will train the model for just one epoch as each epoch takes around
# 7 minutes on a single P100 backed machine.
cycle_gan_model.fit(
    train_set,
    epochs=1,
    steps_per_epoch = 1067,
    callbacks=[tensorboard_callback, plotter, model_checkpoint_callback],
)

Answer 1

錯誤的原因在於兩部分。

第一：生成器和鑒別器未在 strategy.scope() 中初始化

with strategy.scope():
    # Get the generators
    gen_G = get_resnet_generator(name="generator_G")
    gen_F = get_resnet_generator(name="generator_F")

    # Get the discriminators
    disc_X = get_discriminator(name="discriminator_X")
    disc_Y = get_discriminator(name="discriminator_Y")

第二：由於 model 已經在 strategy.scope() 中初始化，因此不需要 strategy.run(self.train_step_single, args=(self, batch_data))。 直接調用training_step_single就夠了。

消除

 def train_step(self, batch_data):
        G_loss, F_loss, F_X_loss, D_Y_loss = strategy.run(self.train_step_single, args=(self, batch_data))
        return G_loss, F_loss, F_X_loss, D_Y_loss

並將 train_step_single 重命名為 train_step 並且它有效。

RuntimeError：方法需要在跨副本上下文中，使用 get_replica_context().merge_call()

問題描述

1 個解決方案

解決方案1
0 2022-10-01 07:44:13

RuntimeError：方法需要在跨副本上下文中，使用 get_replica_context().merge_call()

問題描述

1 個解決方案

解決方案1 0 2022-10-01 07:44:13

解決方案1
0 2022-10-01 07:44:13