I am trying to make a LSTM VAE as a learning stage for future work with pytorch. I managed to get it to work on some small tester data but now that I want to run it on my actual data I am continuously getting this error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [10, 40]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
Any help on how to solve this error and generally just make my code more efficient would be great! Here is the appropriate section of the traceback:
File "<ipython-input-16-7fe0e9e30e5d>", line 190, in <module>
rec_loss, kl_loss = train_batch(x,G_inp,epoch,train=True)
File "<ipython-input-16-7fe0e9e30e5d>", line 166, in train_batch
reconstruction, hidden, kld = model(x, G_inp, None, None)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "<ipython-input-16-7fe0e9e30e5d>", line 93, in forward
mu, logvar, z = self.encoder(x)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "<ipython-input-16-7fe0e9e30e5d>", line 37, in forward
out1, self.hidden = self.lstm(x, self.hidden)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py", line 582, in forward
self.dropout, self.training, self.bidirectional, self.batch_first)
(Triggered internally at /pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
allow_unreachable=True) # allow_unreachable flag
The code is below: (excuse all the.clone(), I read that this could be a solution so I was testing it out everywhere with no help)
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
from keras.preprocessing.text import Tokenizer
from keras import preprocessing
import torch.nn.functional as F
import math
import random
#encoder
class Encoder(nn.Module):
def __init__(self,embedding_dim, vocab_size, n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size):
super(Encoder,self).__init__()
self.n_layers_E = n_layers_E
self.n_hidden_E = n_hidden_E
self.batch_size = batch_size
self.dim_z = dim_z
self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = n_hidden_E, num_layers = n_layers_E, batch_first=True, bidirectional = True)
self.hidden_to_mu = nn.Linear(2*n_hidden_E,dim_z)
self.hidden_to_logvar = nn.Linear(2*n_hidden_G, dim_z)
self.hidden = (torch.zeros(2*n_layers_E, batch_size, n_hidden_E),torch.zeros(2*n_layers_E, batch_size, n_hidden_E))
def forward(self,x):
batch_size, n_seq, n_embed = x.size()
#batch_size, n_seq = x.size()
out1, self.hidden = self.lstm(x, self.hidden)
e_hidden = self.hidden[0].view(batch_size, 2 * self.n_hidden_E).clone()
#e_hidden = torch.cat(list(hidden),dim = 0)
mu = self.hidden_to_mu(e_hidden)
logvar = self.hidden_to_logvar(e_hidden)
epsilon = torch.randn([batch_size, self.dim_z])
z = mu + torch.exp(logvar*0.5)*epsilon
return mu, logvar, z
class Generator(nn.Module):
def __init__(self,n_hidden_G,n_layers_G, embedding_dim, dim_z, vocab_size , batch_size):
super(Generator,self).__init__()
self.n_hidden_G = n_hidden_G
self.n_layers_G = n_layers_G
self.n_z = dim_z
self.batch_size = batch_size
self.LSTM = nn.LSTM(input_size = embedding_dim + dim_z, hidden_size = n_hidden_G, num_layers = n_layers_G, batch_first = True)
self.fc = nn.Linear(n_hidden_G, vocab_size)
self.hidden = (torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G)
,torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G))
def forward(self,x,z, g_hidden = None):
batch_size,n_seq, n_embed = x.size()
#batch_size, n_seq= x.size()
z = torch.cat([z]*n_seq,1).view(batch_size, n_seq, self.n_z)
x = torch.cat([x,z], dim = 2)
if g_hidden is None: #if we are validating
self.hidden = (torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G)
,torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G))
else: #if we are training
self.hidden = g_hidden
output, self.hidden = self.LSTM(x, self.hidden)
output = self.fc(output)
return output, self.hidden
class VAE(nn.Module):
def __init__(self, embedding_dim, vocab_size, n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size,n_layers_G ):
super(VAE, self).__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim)
self.encoder = Encoder(embedding_dim, vocab_size, n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size)
self.generator = Generator(n_hidden_G,n_layers_G, embedding_dim, dim_z, vocab_size, batch_size )
self.n_z = dim_z
def forward(self, x, G_inp, z, G_hidden):
if z is None:
batch_size, n_seq = x.size()
x = self.embedding(x)
mu, logvar, z = self.encoder(x)
kld = -0.5*torch.sum(logvar-mu.pow(2)-logvar.exp()+1).mean()
else:
kld = None
G_inp = self.embedding(G_inp)
logit, G_hidden = self.generator(G_inp,z, G_hidden)
return logit, G_hidden, kld
train_df = pd.read_csv("train.csv", header =None)[0:500]
test_df = pd.read_csv("test.csv",header =None)[0:500]
train = train_df.iloc[:,0]
max_words = 2000
max_len = 25
tok = Tokenizer(num_words = max_words)
tok.fit_on_texts(train)
sequences = tok.texts_to_sequences(train)
sequences_matrix = preprocessing.sequence.pad_sequences(sequences, maxlen = max_len)
#tok.sequences_to_texts(sequences)
n_hidden_E = 10
n_layers_E = 1
embedding_dim = 10
vocab_size = max_words
n_hidden_G = 10
n_layers_G = 2
dim_z = 10
train_size = len(train)
batch_size = 100
rec_coef = 7
lr = 0.01
epochs = 100
def create_generator_input(x, train):
G_inp = x[:, 0:max_len-1].clone() #input for generator should exclude last word of sequence
# if train == False:
# return G_inp
# r = np.random.rand(G_inp.size(0), G_inp.size(1))
# #Perform word_dropout according to random values (r) generated for each word
# for i in range(len(G_inp)):
# for j in range(1,G_inp.size(1)):
# if r[i, j] < opt.word_dropout and G_inp[i, j] not in [vocab.stoi[opt.pad_token], vocab.stoi[opt.end_token]]:
# G_inp[i, j] = vocab.stoi[opt.unk_token]
return G_inp
def producebatches(x,batch_size):
k = math.floor(x.shape[0]/batch_size)
total = (k)*batch_size
flatten = x[0:total].flatten()
batches = flatten.reshape((k,batch_size,x.shape[1]))
return batches
batches = producebatches(sequences_matrix, batch_size)
model = VAE(embedding_dim, vocab_size, n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size,n_layers_G)
optimizer = torch.optim.Adam(model.parameters(), lr = lr)
criterion = nn.BCELoss(reduction = 'sum')
model.train()
def train_batch(x,G_inp,step,train =True):
reconstruction, hidden, kld = model(x, G_inp, None, None)
reconstruction2 = reconstruction.view(-1, vocab_size).clone() #converting into shape (batch_size*(n_seq-1), n_vocab) to facilitate performing F.cross_entropy()
#y = x[:, 1:x.size(1)].clone() #target for generator should exclude first word of sequence
#y = y.contiguous().view(-1)
G_inp2 = G_inp.contiguous().view(-1) #converting into shape (batch_size*(n_seq-1),1) to facilitate performing F.cross_entropy()
rec_loss = F.cross_entropy(reconstruction2,G_inp2)
kld_coef = (math.tanh((step - 15000)/1000) + 1) / 2
#kld_coef = min(1,step/(200000.0))
loss = rec_coef*rec_loss + kld_coef*kld
if train == True:
torch.autograd.set_detect_anomaly(True) #skip below step if we are performing validation
optimizer.zero_grad()
loss.backward(retain_graph=True)
optimizer.step()
return rec_loss.item(), kld.item()
for epoch in range(epochs):
train_rec_loss = []
train_kl_loss = []
for i in range(batches.shape[0]):
x = torch.tensor(batches[i], dtype = torch.long)
G_inp = create_generator_input(x, train = True)
rec_loss, kl_loss = train_batch(x,G_inp,epoch,train=True)
train_rec_loss.append(rec_loss)
train_kl_loss.append(kl_loss)
train_rec_loss = np.mean(train_rec_loss)
train_kl_loss = np.mean(train_kl_loss)
print("No.", epoch, "T_rec:", '%.2f'%rec_loss, "T_kld:", '%.2f'%kl_loss)
def generate_sentences(n):
model.eval()
sentences = []
for i in range(n):
z = torch.randn([1, dim_z])
hidden = (torch.zeros(n_layers_G,1, n_hidden_G)
,torch.zeros(n_layers_G, 1, n_hidden_G))
G_inp = torch.LongTensor(1,1).fill_(1)
str_ind = []
while len(str_ind)<49:
with torch.autograd.no_grad():
logit, G_hidden, _ = model(None, G_inp, z, hidden)
probs = F.softmax(logit[0],dim=1)
G_inp = torch.multinomial(probs,1)
str_ind.append(G_inp[0][0].item())
sentences.append(str_ind)
return sentences
t = generate_sentences(1)
First, you can re-initialize your hidden layer after each epoch. This will overcome the error that you are facing without any major changes:
for epoch in range(epochs):
train_rec_loss = []
train_kl_loss = []
for i in range(batches.shape[0]):
x = torch.tensor(batches[i], dtype = torch.long)
G_inp = create_generator_input(x, train = True)
rec_loss, kl_loss = train_batch(x,G_inp,epoch,train=True)
train_rec_loss.append(rec_loss)
train_kl_loss.append(kl_loss)
model.hidden = (torch.zeros(n_layers_G, batch_size, n_hidden_G)
,torch.zeros(n_layers_G, batch_size, n_hidden_G))
train_rec_loss = np.mean(train_rec_loss)
train_kl_loss = np.mean(train_kl_loss)
print("No.", epoch, "T_rec:", '%.2f'%rec_loss, "T_kld:", '%.2f'%kl_loss)
Furthermore, you can avoid inplace operations on the activation functions and dropout operations (inplace = False)
(I think it is not your case).
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.