Distributed training using MirroredStrategy in TensorFlow 2.2 with custom training loop not working - getting stuck when updating gradients
I'm using tf.distribute.Strategy to train a model, based on unet, with MirroredStrategy over two (or more) GPUs. Below is my code for the custom training loop I use for the forward and backward passes of the network. For some reason, the logits, loss and gradients of the first batch of the first epoch are calculated, but then it gets stuck at optimizer.apply_gradients(zip(gradients, model.trainable_variables)). I can't for the life of me work out what the problem is, so any help would be much appreciated.
import os
import glob
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import Progbar
import tensorflow.keras.backend as K
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Activation, Dense, BatchNormalization, Dropout
from tensorflow.keras.layers import UpSampling2D, Concatenate
from evaluation import diceCoef
tf.config.experimental_run_functions_eagerly(True)
class Train():

    def __init__(self, model, lossFunc, optimizer, strategy, epochs, batchSize):
        self.epochs = epochs
        self.batchSize = batchSize
        self.strategy = strategy
        #self.lossFunc = lossFunc
        self.lossFunc = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
        self.optimizer = optimizer
        self.model = model
        self.history = {'trainloss': [], 'trainmetric': [], 'valmetric': [], 'valloss': []}

    def computeLoss(self, yPred, yTrue):
        #loss = tf.reduce_sum(self.lossFunc(yPred, yTrue)) * (1./self.batchSize)
        loss = self.lossFunc(yPred, yTrue)
        loss = loss * (1. / self.strategy.num_replicas_in_sync)
        #print(loss)
        return loss

    @tf.function
    def trainStep(self, x, y, i):
        #x = batch[0]
        #y = batch[1]
        x = tf.cast(x, tf.float32)
        y = tf.cast(y, tf.float32)
        #print(self.model.trainable_variables)
        with tf.GradientTape() as tape:
            logits = self.model(x, training=True)
            logits = tf.cast(logits, tf.float32)
            loss = self.computeLoss(logits, y)
            #loss = self.lossFunc(logits, y)
            #print('loss', loss)
        gradients = tape.gradient(loss, self.model.trainable_variables)
        print(len(gradients))
        print(len(self.model.trainable_variables))
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
        return loss, logits

    @tf.function
    def validStep(self, x, y):
        logits = self.model(x, training=False)
        loss = self.lossFunc(y, logits)
        return loss, logits

    @tf.function
    def distributedTrainEpoch(self, dataset, trainSteps):
        totalDice = 0
        totalLoss = 0
        #prog = Progbar(trainSteps-1)
        for i, batch in enumerate(dataset):
            x = batch[0]
            #y = tf.expand_dims(batch[1], axis=-1)
            y = batch[1]
            batchLoss, logits = self.strategy.run(self.trainStep, args=(x, y, i))
            print('batchloss', batchLoss)
            #pred = (logits.numpy() > 0.5).astype('int16').astype(np.float16)
            #batchDice = self.strategy.run(diceCoef, args=(pred, y))
            totalLoss += self.strategy.reduce(tf.distribute.ReduceOp.SUM, batchLoss, axis=None)
            #totalDice += self.strategy.reduce(tf.distribute.ReduceOp.SUM, batchDice, axis=None)
            #prog.update(i)
        return totalLoss, totalDice

    @tf.function
    def distributedValidEpoch(self, dataset):
        totalLoss = 0
        totalDice = 0
        for d in dataset:
            x = d[0]
            y = tf.expand_dims(d[1], axis=-1)
            loss, logits = self.strategy.run(self.validStep, args=(x, y))
            pred = (logits.numpy() > 0.5).astype('int16').astype(np.float16)
            dice = self.strategy.run(diceCoef, args=(pred, y))
            totalLoss += self.strategy.reduce(tf.distribute.ReduceOp.SUM, loss, axis=None)
            totalDice += self.strategy.reduce(tf.distribute.ReduceOp.SUM, dice, axis=None)
        return totalLoss, totalDice

    def forward(self, trainDataset, validDataset, trainSteps, validSteps):
        for e in range(self.epochs):
            tf.print('Epoch: {}/{}...'.format(e + 1, self.epochs), end="")
            trainLoss, trainDice = self.distributedTrainEpoch(trainDataset, trainSteps)
            avgTrainDice = trainDice.numpy()[0] / trainSteps
            avgTrainLoss = trainLoss.numpy() / trainSteps
            print('train', avgTrainDice)
            print('loss', avgTrainLoss)
            tf.print(' Epoch: {}/{}, loss - {:.2f}, dice - {:.2f}'.format(
                e + 1, self.epochs, avgTrainLoss, avgTrainDice), end="")
            valLoss, valDice = self.distributedValidEpoch(validDataset)
            avgValidDice = valDice.numpy()[0] / validSteps
            avgValidLoss = valLoss.numpy() / validSteps
            self.history['trainmetric'].append(avgTrainDice)
            self.history['trainloss'].append(avgTrainLoss)
            self.history['valmetric'].append(avgValidDice)
            self.history['valloss'].append(avgValidLoss)
            tf.print(' val_loss - {:.3f}, val_dice - {:.3f}'.format(avgValidLoss, avgValidDice))
        return self.model, self.history
This is the part of the code, from another script, that sets up the strategy scope, builds the model and calls the Train class.
with strategy.scope():

    if model == 'fcn8':
        print('Model: {}'.format(model))
        with tf.device('/cpu:0'):
            if api == 'functional':
                fcn = FCN()
                model = fcn.getFCN8()
            elif api == 'subclass':
                model = FCN()

    elif model == 'unet':
        print('Model: {}'.format(model))
        with tf.device('/cpu:0'):
            if api == 'functional':
                unetModel = unet2.UnetFunc()
                model = unetModel.unet()
            elif api == 'subclass':
                model = unetsc.UnetSC(filters=filters)
                model.build((1, imgDims, imgDims, 3))

    elif model == 'unetmini':
        print('Model: {}'.format(model))
        with tf.device('/cpu:0'):
            if api == 'functional':
                unetminiModel = UnetMini(filters=filters)
                model = unetminiModel.unetmini()
            elif api == 'subclass':
                model = UnetMini(filters)

    elif model == 'resunet':
        print('Model: {}'.format(model))
        with tf.device('/cpu:0'):
            if api == 'functional':
                resunetModel = ResUnet(filters)
                model = resunetModel.ResUnetFunc()
            elif api == 'subclass':
                model = ResunetSc(filters)

    elif model == 'resunet-a':
        print('Model: {}'.format(model))
        with tf.device('/cpu:0'):
            if api == 'functional':
                resunetModel = ResUnetA(filters)
                model = resunetModel.ResUnetAFunc()
            elif api == 'subclass':
                model = ResunetASc(filters)

    elif model == 'attention':
        print('Model: {}'.format(model))
        with tf.device('/cpu:0'):
            if api == 'functional':
                attenModel = AttenUnetFunc(filters)
                model = attenModel.attenUnet()
            elif api == 'subclass':
                model = AttenUnetSC(filters)

    else:
        raise ValueError('No model requested, please update config file')

    # print('trainable variables', str(model.trainable_variables))
    trainer = train.Train(model, loss, optimizer, strategy, epoch, batchSize)
    trainDistDataset = strategy.experimental_distribute_dataset(trainDataset)
    validDistDataset = strategy.experimental_distribute_dataset(validDataset)
    model, history = trainer.forward(trainDistDataset, validDistDataset, trainSteps, validSteps)
And the subclassed unet model is as follows:
class UnetSC(Model):

    def __init__(self, filters=[16, 32, 64, 128, 256], finalActivation='sigmoid', activation='relu',
                 nOutput=1, kSize=(3, 3), pSize=(2, 2), dropout=0, normalize=True, padding='same', dtype='float32'):
        super(UnetSC, self).__init__(dtype=dtype)

        self.normalize = normalize

        self.conve1_1 = Conv2D(filters[0], kSize, activation='relu', padding='same', name='greg')
        self.batchnorm1 = BatchNormalization(name='greggggggg')
        self.conve1_2 = Conv2D(filters[0], kSize, activation='relu', padding='same')
        self.batchnorm2 = BatchNormalization()
        self.pool1 = MaxPooling2D((2, 2))

        self.conve2_1 = Conv2D(filters[1], kSize, activation='relu', padding='same')
        self.batchnorm3 = BatchNormalization()
        self.conve2_2 = Conv2D(filters[1], kSize, activation='relu', padding='same')
        self.batchnorm4 = BatchNormalization()
        self.pool2 = MaxPooling2D((2, 2))

        self.conve3_1 = Conv2D(filters[2], kSize, activation='relu', padding='same')
        self.batchnorm5 = BatchNormalization()
        self.conve3_2 = Conv2D(filters[2], kSize, activation='relu', padding='same')
        self.batchnorm6 = BatchNormalization()
        self.pool3 = MaxPooling2D((2, 2))

        self.conve4_1 = Conv2D(filters[3], kSize, activation='relu', padding='same')
        self.batchnorm7 = BatchNormalization()
        self.conve4_2 = Conv2D(filters[3], kSize, activation='relu', padding='same', name='finalencoder')
        self.batchnorm8 = BatchNormalization()
        self.pool4 = MaxPooling2D((2, 2))

        self.convb_1 = Conv2D(filters[4], kSize, activation='relu', padding='same')
        self.batchnorm9 = BatchNormalization()
        self.convb_2 = Conv2D(filters[4], kSize, activation='relu', padding='same')
        self.batchnorm10 = BatchNormalization()

        self.upsampling1 = UpSampling2D((2, 2))
        self.conc1 = Concatenate()
        self.convd1_1 = Conv2D(filters[3], kSize, activation='relu', padding='same')
        self.batchnorm11 = BatchNormalization()
        self.convd1_2 = Conv2D(filters[3], kSize, activation='relu', padding='same')
        self.batchnorm12 = BatchNormalization()

        self.upsampling2 = UpSampling2D((2, 2))
        self.conc2 = Concatenate()
        self.convd2_1 = Conv2D(filters[2], kSize, activation='relu', padding='same')
        self.batchnorm13 = BatchNormalization()
        self.convd2_2 = Conv2D(filters[2], kSize, activation='relu', padding='same')
        self.batchnorm14 = BatchNormalization()

        self.upsampling3 = UpSampling2D((2, 2))
        self.conc3 = Concatenate()
        self.convd3_1 = Conv2D(filters[1], kSize, activation='relu', padding='same')
        self.batchnorm15 = BatchNormalization()
        self.convd3_2 = Conv2D(filters[1], kSize, activation='relu', padding='same')
        self.batchnorm16 = BatchNormalization()

        self.upsampling4 = UpSampling2D((2, 2))
        self.conc4 = Concatenate()
        self.convd4_1 = Conv2D(filters[0], kSize, activation='relu', padding='same')
        self.batchnorm17 = BatchNormalization()
        self.convd4_2 = Conv2D(filters[0], kSize, activation='relu', padding='same')
        self.batchnorm18 = BatchNormalization()

        self.final = Conv2D(nOutput, kernel_size=(1, 1), strides=(1, 1), activation=finalActivation)

    def call(self, x, training=True):
        e1 = self.conve1_1(x)
        e1 = self.batchnorm1(e1)
        e1 = self.conve1_2(e1)
        e1 = self.batchnorm2(e1)
        p1 = self.pool1(e1)

        e2 = self.conve2_1(p1)
        e2 = self.batchnorm3(e2)
        e2 = self.conve2_2(e2)
        e2 = self.batchnorm4(e2)
        p2 = self.pool2(e2)

        e3 = self.conve3_1(p2)
        e3 = self.batchnorm5(e3)
        e3 = self.conve3_2(e3)
        e3 = self.batchnorm6(e3)
        p3 = self.pool3(e3)

        e4 = self.conve4_1(p3)
        e4 = self.batchnorm7(e4)
        e4 = self.conve4_2(e4)
        e4 = self.batchnorm8(e4)
        p4 = self.pool4(e4)

        b = self.convb_1(p4)
        b = self.batchnorm9(b)
        b = self.convb_2(b)
        b = self.batchnorm10(b)

        d1 = self.upsampling1(b)
        d1 = self.conc1([e4, d1])
        d1 = self.convd1_1(d1)
        d1 = self.batchnorm11(d1)
        d1 = self.convd1_2(d1)
        d1 = self.batchnorm12(d1)

        d2 = self.upsampling2(d1)
        d2 = self.conc2([e3, d2])
        d2 = self.convd2_1(d2)
        d2 = self.batchnorm13(d2)
        d2 = self.convd2_2(d2)
        d2 = self.batchnorm14(d2)

        d3 = self.upsampling3(d2)
        d3 = self.conc3([e2, d3])
        d3 = self.convd3_1(d3)
        d3 = self.batchnorm15(d3)
        d3 = self.convd3_2(d3)
        d3 = self.batchnorm16(d3)

        d4 = self.upsampling4(d3)
        d4 = self.conc4([e1, d4])
        d4 = self.convd4_1(d4)
        d4 = self.batchnorm17(d4)
        d4 = self.convd4_2(d4)
        d4 = self.batchnorm18(d4)

        x = self.final(d4)

        return x
u = UnetSC()
u.build((1, 256, 256, 3))
The output trace, up to the point where it hangs:
Using TensorFlow backend.
Now executing following model: unet_32_adam_diceloss_FR_0_2.5x_germ_32
2020-06-12 18:14:00.672680: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-06-12 18:14:00.815119: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties:
pciBusID: 0000:3f:00.0 name: Tesla V100-PCIE-16GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 15.75GiB deviceMemoryBandwidth: 836.37GiB/s
2020-06-12 18:14:00.816539: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 1 with properties:
pciBusID: 0000:40:00.0 name: Tesla V100-PCIE-16GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 15.75GiB deviceMemoryBandwidth: 836.37GiB/s
2020-06-12 18:14:00.817342: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-06-12 18:14:00.820640: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-06-12 18:14:00.823040: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2020-06-12 18:14:00.823833: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10
2020-06-12 18:14:00.826794: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10
2020-06-12 18:14:00.829026: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10
2020-06-12 18:14:00.834643: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-06-12 18:14:00.839962: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1703] Adding visible gpu devices: 0, 1
2020-06-12 18:14:00.840532: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2020-06-12 18:14:00.855173: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2200000000 Hz
2020-06-12 18:14:00.857769: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x58fdc10 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-06-12 18:14:00.857804: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2020-06-12 18:14:01.277928: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x59680f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-06-12 18:14:01.278008: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Tesla V100-PCIE-16GB, Compute Capability 7.0
2020-06-12 18:14:01.278031: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (1): Tesla V100-PCIE-16GB, Compute Capability 7.0
2020-06-12 18:14:01.284602: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties:
pciBusID: 0000:3f:00.0 name: Tesla V100-PCIE-16GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 15.75GiB deviceMemoryBandwidth: 836.37GiB/s
2020-06-12 18:14:01.291638: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 1 with properties:
pciBusID: 0000:40:00.0 name: Tesla V100-PCIE-16GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 15.75GiB deviceMemoryBandwidth: 836.37GiB/s
2020-06-12 18:14:01.291808: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-06-12 18:14:01.291883: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-06-12 18:14:01.291935: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2020-06-12 18:14:01.291988: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10
2020-06-12 18:14:01.292039: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10
2020-06-12 18:14:01.292086: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10
2020-06-12 18:14:01.292151: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-06-12 18:14:01.304148: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1703] Adding visible gpu devices: 0, 1
2020-06-12 18:14:01.304295: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-06-12 18:14:01.312107: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1102] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-06-12 18:14:01.312143: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1108] 0 1
2020-06-12 18:14:01.312164: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1121] 0: N Y
2020-06-12 18:14:01.312180: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1121] 1: Y N
2020-06-12 18:14:01.318105: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1247] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14864 MB memory) -> physical GPU (device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0000:3f:00.0, compute capability: 7.0)
2020-06-12 18:14:01.320434: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1247] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 14864 MB memory) -> physical GPU (device: 1, name: Tesla V100-PCIE-16GB, pci bus id: 0000:40:00.0, compute capability: 7.0)
Epoch: 1/40...WARNING:tensorflow:Using MirroredStrategy eagerly has significant overhead currently. We will be working on improving this in the future, but for now please wrap `call_for_each_replica` or `experimental_run` or `run` inside a tf.function to get the best performance.
2020-06-12 18:14:16.135798: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-06-12 18:14:18.493751: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
74
74
74
74
Then it just sticks here. Please help!
I would have left this as a comment, but I don't have enough reputation. Have you tried without all those @tf.function decorators — is it the same? And if the problem is in the trainStep function, maybe you could try variants; I think you possibly need to pass trainable_variables as an argument to that function.
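For example, something like this minimal, untested sketch of the kind of variant I mean — it reuses the computeLoss helper from the question, drops the decorator, and takes the variables as an argument instead of reading self.model.trainable_variables inside the traced function:

def trainStep(self, x, y, trainableVariables):
    x = tf.cast(x, tf.float32)
    y = tf.cast(y, tf.float32)
    with tf.GradientTape() as tape:
        logits = self.model(x, training=True)
        loss = self.computeLoss(logits, y)
    # differentiate and update against the variables that were passed in
    gradients = tape.gradient(loss, trainableVariables)
    self.optimizer.apply_gradients(zip(gradients, trainableVariables))
    return loss, logits

which you would then call as self.strategy.run(self.trainStep, args=(x, y, self.model.trainable_variables)). Just a guess at a variant to try, not something I have verified.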
I managed to solve this myself. Here is the whole code snippet to train my network, which uses a mirrored strategy to run the training on multiple GPUs. It calculates the per-replica loss and dice score on each GPU and sums them accordingly. Note in computeLoss the correct way to calculate the loss: sum over all pixels and divide by the total dimensions to get the average loss per image (we get a loss value for each pixel).
import os
import tensorflow as tf
import numpy as np
import datetime
import tensorflow.keras.backend as K
from tensorflow.keras.utils import Progbar
from custom_loss_classes import WeightedBinaryCrossEntropy
from evaluation import diceCoef
#import memory_saving_gradients
#tf.__dict__["gradients"] = memory_saving_gradients.gradients_speed
class DistributeTrain():

    def __init__(self, epochs, model, optimizer, lossObject, batchSize,
                 strategy, trainSteps, testNum, imgDims, threshold, modelName, currentTime, currentDate):
        self.epochs = epochs
        self.batchSize = batchSize
        self.strategy = strategy
        self.loss_object = lossObject
        self.optimizer = optimizer
        self.metric = diceCoef
        self.model = model
        self.trainSteps = trainSteps
        self.testNum = testNum
        self.imgDims = imgDims
        self.history = {'trainloss': [], 'trainmetric': [], 'valmetric': [], 'valloss': []}
        self.threshold = threshold
        self.modelName = modelName
        self.currentTime = currentTime
        self.currentDate = currentDate

    def computeLoss(self, label, predictions):
        loss = self.loss_object(label, predictions)
        print('loss', loss)
        loss = tf.reduce_sum(loss) * (1. / (self.imgDims * self.imgDims * self.batchSize))
        return loss * (1 / self.strategy.num_replicas_in_sync)

    def computeDice(self, yTrue, yPred):
        dice = self.metric(yTrue, yPred)
        dice = dice * (1 / self.strategy.num_replicas_in_sync)
        return dice

    def trainStep(self, inputs):
        x, y = inputs
        with tf.GradientTape() as tape:
            logits = self.model(x, training=True)
            loss = self.computeLoss(y, logits)
            yPred = tf.cast((logits > 0.5), tf.float32)
            dice = self.computeDice(y, yPred)
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
        return loss, dice

    def testStep(self, inputs):
        x, y = inputs
        predictions = self.model(x, training=False)
        loss = self.loss_object(y, predictions)
        yPred = tf.cast((predictions > 0.5), tf.float32)
        #print('y', np.unique(y.numpy()))
        #print('yPred', np.unique(yPred.numpy()))
        dice = self.computeDice(y, yPred)
        loss = tf.reduce_sum(loss) * (1. / (self.imgDims * self.imgDims * self.batchSize))
        return loss, dice

    @tf.function
    def distributedTrainEpoch(self, batch):
        #totalLoss = 0.0
        #totalDice = 0.0
        #i = 0
        #prog = Progbar(self.trainSteps-1)
        #for batch in trainData:
            #i+=1
        replicaLoss, replicaDice = self.strategy.run(self.trainStep, args=(batch,))
        # totalLoss += self.strategy.reduce(tf.distribute.ReduceOp.SUM, replicaLoss, axis=None)
        # totalDice += self.strategy.reduce(tf.distribute.ReduceOp.SUM, replicaDice, axis=None)
        #prog.update(i)
        #return totalLoss, totalDice
        return replicaLoss, replicaDice

    # TODO: hack to get the Progbar into the distributed train function; need a
    # way of converting tensor i to an integer
    def getDistTrainEpoch(self, trainData):
        totalLoss = 0.0
        totalDice = 0.0
        i = 0
        prog = Progbar(self.trainSteps - 1)
        for batch in trainData:
            replicaLoss, replicaDice = self.distributedTrainEpoch(batch)
            totalLoss += self.strategy.reduce(tf.distribute.ReduceOp.SUM, replicaLoss, axis=None)
            totalDice += self.strategy.reduce(tf.distribute.ReduceOp.SUM, replicaDice, axis=None)
            prog.update(i)
            i += 1
        return totalLoss, totalDice

    @tf.function
    def distributedTestEpoch(self, validData):
        totalLoss = 0.0
        totalDice = 0.0
        for d in validData:
            loss, dice = self.strategy.run(self.testStep, args=(d,))
            totalLoss += self.strategy.reduce(tf.distribute.ReduceOp.SUM, loss, axis=None)
            totalDice += self.strategy.reduce(tf.distribute.ReduceOp.SUM, dice, axis=None)
        return totalLoss, totalDice

    # we want to stop on a moving average value, min threshold dice and min epoch iterations
    def earlyStop(self, valDice, epoch):
        ma = np.mean(np.array(self.history['valmetric'][-5:]))
        # removed moving average
        stop = True if epoch > self.threshold['epochs'] and valDice > self.threshold['metric'] else False
        return stop

    def forward(self, trainDistDataset, testDistDataset):
        currentTime = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
        trainLogDir = os.path.join('tensorboard_logs', 'train', self.currentDate, self.modelName + '_' + self.currentTime)
        testLogDir = os.path.join('tensorboard_logs', 'test', self.currentDate, self.modelName + '_' + self.currentTime)
        trainWriter = tf.summary.create_file_writer(trainLogDir)
        testWriter = tf.summary.create_file_writer(testLogDir)

        for epoch in range(self.epochs):
            #trainLoss, trainDice = self.distributedTrainEpoch(trainDistDataset)
            trainLoss, trainDice = self.getDistTrainEpoch(trainDistDataset)
            epochTrainLoss, epochTrainDice = float(trainLoss / self.trainSteps), float(trainDice / self.trainSteps)
            with trainWriter.as_default():
                tf.summary.scalar('loss', epochTrainLoss, step=epoch)
                tf.summary.scalar('dice', epochTrainDice, step=epoch)
            tf.print(' Epoch: {}/{}, loss - {:.2f}, dice - {:.2f}, lr - {:.5f}'.format(
                epoch + 1, self.epochs, epochTrainLoss, epochTrainDice, 1), end="")
            testLoss, testDice = self.distributedTestEpoch(testDistDataset)
            epochTestLoss, epochTestDice = float(testLoss / self.testNum), float(testDice / self.testNum)
            with testWriter.as_default():
                tf.summary.scalar('loss', epochTestLoss, step=epoch)
                tf.summary.scalar('dice', epochTestDice, step=epoch)
            tf.print(' val_loss - {:.3f}, val_dice - {:.3f}'.format(epochTestLoss, epochTestDice))
            self.history['trainmetric'].append(epochTrainDice)
            self.history['trainloss'].append(epochTrainLoss)
            self.history['valmetric'].append(epochTestDice)
            self.history['valloss'].append(epochTestLoss)
            if self.earlyStop(epochTestDice, epoch):
                print('Stopping early on epoch: {}'.format(epoch))
                break

        return self.model, self.history
Try to comment out the code that sets eager mode:

# tf.config.experimental_run_functions_eagerly(True)
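(A side note: in TF 2.2 this switch is the experimental tf.config.experimental_run_functions_eagerly; later releases rename it to tf.config.run_functions_eagerly. As far as I know, calling it with False has the same effect as commenting the call out.)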
I've suffered from the exact same problem. My model with MirroredStrategy worked well with a single GPU but got stuck with multiple GPUs at optimizer.apply_gradients() — more specifically, at _merge_call() (waiting forever at t.should_run.wait() inside that function). This happened when I turned eager mode on with:

tf.config.run_functions_eagerly(True)

After commenting this out, my model worked well with multiple GPUs. This seems to be related to the warning message:

WARNING:tensorflow:Using MirroredStrategy eagerly has significant overhead currently. We will be working on improving this in the future, but for now please wrap `call_for_each_replica` or `experimental_run` or `run` inside a tf.function to get the best performance.
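To make that concrete, here is a minimal, self-contained sketch of the pattern the warning asks for — eager execution left off and strategy.run wrapped in a tf.function. It uses a toy stand-in model rather than the unet from the question, and the shapes and batch size are arbitrary:

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
GLOBAL_BATCH = 8

with strategy.scope():
    # create the model, optimizer and loss under the strategy scope
    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    optimizer = tf.keras.optimizers.Adam()
    lossFunc = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

@tf.function  # keep this traced: do NOT call tf.config.run_functions_eagerly(True)
def distributedStep(x, y):
    def stepFn(x, y):
        with tf.GradientTape() as tape:
            logits = model(x, training=True)
            # scale the per-example loss by the global batch size
            loss = tf.nn.compute_average_loss(lossFunc(y, logits),
                                              global_batch_size=GLOBAL_BATCH)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return loss
    perReplicaLoss = strategy.run(stepFn, args=(x, y))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, perReplicaLoss, axis=None)

# dummy data just to exercise the loop
dataset = tf.data.Dataset.from_tensor_slices(
    (tf.random.normal((32, 4)), tf.random.uniform((32, 1)))).batch(GLOBAL_BATCH)
for x, y in strategy.experimental_distribute_dataset(dataset):
    print(distributedStep(x, y).numpy())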