Here is my training loop:

import time
import torch

def train(model, train_dl, valid_dl, loss_fn, optimizer, scheduler, acc_fn, epochs=50):
    start = time.time()
    model.cuda()
    train_loss, valid_loss = [], []
    train_acc, valid_acc = [], []
    best_acc = 0.0
    for epoch in range(epochs):
        print('Epoch {}/{}'.format(epoch, epochs - 1))
        print('-' * 10)
        time_epoch_start = time.time()
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train(True)  # Set training mode = true
                dataloader = train_dl
            else:
                model.train(False)  # Set model to evaluate mode
                dataloader = valid_dl
            running_loss = 0.0
            running_acc = 0.0
            step = 0
            for x, y, _ in dataloader:
                x = x.cuda()
                y = y.cuda()
                step += 1
                if phase == 'train':
                    optimizer.zero_grad()
                    outputs = model(x)
                    loss = loss_fn(outputs, y)
                    loss.backward()
                    optimizer.step()
                    if scheduler is not None:
                        scheduler.step()
                else:
                    with torch.no_grad():
                        outputs = model(x)
                        loss = loss_fn(outputs, y.long())
                acc = acc_fn(outputs, y)
                running_acc += acc * dataloader.batch_size
                running_loss += loss * dataloader.batch_size
                if step % 100 == 0:
                    print('Current step: {} Loss: {} Acc: {} AllocMem (Mb): {}'.format(
                        step, loss, acc, torch.cuda.memory_allocated() / 1024 / 1024))
            epoch_loss = running_loss / len(dataloader.dataset)
            epoch_acc = running_acc / len(dataloader.dataset)
            train_loss.append(epoch_loss) if phase == 'train' else valid_loss.append(epoch_loss)
            train_acc.append(epoch_acc) if phase == 'train' else valid_acc.append(epoch_acc)
            if phase == 'train':
                train_loss_print = epoch_loss
                train_acc_print = epoch_acc
            else:
                valid_loss_print = epoch_loss
                valid_acc_print = epoch_acc
        time_epoch = time.time() - time_epoch_start
        print('Epoch {}/{} - TRAIN Loss: {:.4f} TRAIN Acc: {:.4f} - VAL. Loss: {:.4f} VAL. Acc: {:.4f} ({:.4f} seconds - {:.4f} Mb)'.format(
            epoch, epochs - 1, train_loss_print, train_acc_print,
            valid_loss_print, valid_acc_print, time_epoch,
            torch.cuda.memory_allocated() / 1024 / 1024))
    time_elapsed = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    return model, train_loss, valid_loss, train_acc, valid_acc
Here I define the optimizer and learning-rate scheduler:

from torch.optim import lr_scheduler

opt = torch.optim.Adam(modelo.parameters(), lr=hp_lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
lr_scheduler = lr_scheduler.StepLR(opt, step_size=10, gamma=0.1)
modelo_trained, train_loss, valid_loss, train_acc, valid_acc = train(
    modelo, train_dl, valid_dl, loss_fn, opt, lr_scheduler, acc_metric, epochs=num_epochs)
However, when training, here is my training loss: [Graphic][1]

[1]: https://i.stack.imgur.com/3Y1wL.png
The repeated pattern in the loss seems to indicate that the model is not training and that there is an issue with the training loop in particular, because even an incorrect model would still show variance in the loss over time.
Welcome to Stack Overflow!
You are calling model.cuda() inside the train() function, but the optimizer is supplied from outside, where it was constructed while the parameters were still on the CPU. That smells like the optimizer was initialized on the wrong parameters. The torch.optim docs explicitly recommend moving a model to the GPU via .cuda() before constructing any optimizer for it.
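Here is a minimal sketch of the ordering I mean, reusing the names from your snippets (modelo, hp_lr, train_dl, valid_dl, loss_fn, acc_metric and num_epochs are assumed to be defined as in your question):

# Move the parameters to the GPU *before* building the optimizer,
# so the optimizer holds references to the CUDA parameters.
# The model.cuda() call inside train() then becomes a harmless no-op.
from torch.optim import lr_scheduler

modelo = modelo.cuda()

opt = torch.optim.Adam(modelo.parameters(), lr=hp_lr)
scheduler = lr_scheduler.StepLR(opt, step_size=10, gamma=0.1)

# Quick sanity check: every tensor the optimizer will update should now be on CUDA.
assert all(p.is_cuda for group in opt.param_groups for p in group['params'])

modelo_trained, train_loss, valid_loss, train_acc, valid_acc = train(
    modelo, train_dl, valid_dl, loss_fn, opt, scheduler, acc_metric, epochs=num_epochs)

Note that I also renamed the StepLR instance to scheduler so it no longer shadows the lr_scheduler module, which your original assignment does.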
PS: let me know whether that makes sense and/or works.