
PyTorch LSTM-VAE Sentence Generator: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

I am trying to build an LSTM VAE in PyTorch as a learning exercise for future work. I managed to get it working on some small test data, but now that I want to run it on my actual data I keep getting this error:

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [10, 40]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Any help on how to solve this error, and more generally on making my code more efficient, would be great! Here is the relevant section of the traceback:

File "<ipython-input-16-7fe0e9e30e5d>", line 190, in <module>
    rec_loss, kl_loss = train_batch(x,G_inp,epoch,train=True)
  File "<ipython-input-16-7fe0e9e30e5d>", line 166, in train_batch
    reconstruction, hidden, kld = model(x, G_inp, None, None)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "<ipython-input-16-7fe0e9e30e5d>", line 93, in forward
    mu, logvar, z = self.encoder(x)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "<ipython-input-16-7fe0e9e30e5d>", line 37, in forward
    out1, self.hidden  = self.lstm(x, self.hidden)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py", line 582, in forward
    self.dropout, self.training, self.bidirectional, self.batch_first)
 (Triggered internally at  /pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
  allow_unreachable=True)  # allow_unreachable flag

The code is below (excuse all the .clone() calls; I read that this could be a solution, so I was testing it out everywhere, with no luck):

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim

from keras.preprocessing.text import Tokenizer
from keras import preprocessing
import torch.nn.functional as F
import math
import random




#encoder
class Encoder(nn.Module):
    def __init__(self,embedding_dim, vocab_size,  n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size):
        super(Encoder,self).__init__()
    
        self.n_layers_E = n_layers_E
        self.n_hidden_E = n_hidden_E
        self.batch_size = batch_size
        self.dim_z = dim_z
        
        self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = n_hidden_E, num_layers = n_layers_E, batch_first=True, bidirectional = True)
        self.hidden_to_mu = nn.Linear(2*n_hidden_E,dim_z)
        self.hidden_to_logvar = nn.Linear(2*n_hidden_G, dim_z)
        
        self.hidden = (torch.zeros(2*n_layers_E, batch_size, n_hidden_E),torch.zeros(2*n_layers_E, batch_size, n_hidden_E))
        
    def forward(self,x):
        
        batch_size, n_seq, n_embed = x.size()
        #batch_size, n_seq = x.size()
        out1, self.hidden  = self.lstm(x, self.hidden)
        e_hidden = self.hidden[0].view(batch_size, 2 * self.n_hidden_E).clone()

        #e_hidden = torch.cat(list(hidden),dim = 0)

        mu = self.hidden_to_mu(e_hidden)
        logvar = self.hidden_to_logvar(e_hidden)
        epsilon = torch.randn([batch_size, self.dim_z]) 
        z = mu + torch.exp(logvar*0.5)*epsilon 
        
        return mu, logvar, z
    
class Generator(nn.Module):
    def __init__(self,n_hidden_G,n_layers_G, embedding_dim, dim_z, vocab_size , batch_size):
        super(Generator,self).__init__()
    
        self.n_hidden_G =  n_hidden_G
        self.n_layers_G = n_layers_G
        self.n_z = dim_z
        self.batch_size = batch_size
        self.LSTM = nn.LSTM(input_size = embedding_dim + dim_z, hidden_size = n_hidden_G, num_layers = n_layers_G, batch_first = True)
        self.fc = nn.Linear(n_hidden_G, vocab_size)
        
        self.hidden = (torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G)
                       ,torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G))
        
    def forward(self,x,z, g_hidden = None):
        
        batch_size,n_seq, n_embed = x.size()
        #batch_size, n_seq= x.size()
        z = torch.cat([z]*n_seq,1).view(batch_size, n_seq, self.n_z)
        x = torch.cat([x,z], dim = 2)
        
        if g_hidden is None:                                        #if we are validating
            self.hidden = (torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G)
                       ,torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G))
        else:                                                       #if we are training
            self.hidden = g_hidden
            
        output, self.hidden = self.LSTM(x, self.hidden)
        output = self.fc(output)
        
        return output, self.hidden
    
class VAE(nn.Module):
    def __init__(self, embedding_dim, vocab_size,  n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size,n_layers_G ):
        super(VAE, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = Encoder(embedding_dim, vocab_size,  n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size)
        self.generator = Generator(n_hidden_G,n_layers_G, embedding_dim, dim_z, vocab_size, batch_size )
        self.n_z = dim_z
        
    def forward(self, x, G_inp, z, G_hidden):
        if z is None:
            batch_size, n_seq = x.size()
            x = self.embedding(x)
            mu, logvar, z = self.encoder(x)
        
            kld = -0.5*torch.sum(logvar-mu.pow(2)-logvar.exp()+1).mean()
        else:
            kld = None
        G_inp = self.embedding(G_inp)
        
        logit, G_hidden = self.generator(G_inp,z, G_hidden)
        return logit, G_hidden, kld



train_df = pd.read_csv("train.csv", header =None)[0:500]
test_df = pd.read_csv("test.csv",header =None)[0:500]

train = train_df.iloc[:,0]

max_words = 2000
max_len = 25


tok = Tokenizer(num_words = max_words)
tok.fit_on_texts(train)
sequences = tok.texts_to_sequences(train)
sequences_matrix = preprocessing.sequence.pad_sequences(sequences, maxlen = max_len)

#tok.sequences_to_texts(sequences)


n_hidden_E = 10
n_layers_E = 1
embedding_dim = 10
vocab_size = max_words
n_hidden_G = 10
n_layers_G = 2
dim_z = 10 
train_size = len(train)
batch_size = 100
rec_coef = 7

lr = 0.01
epochs = 100

def create_generator_input(x, train):
    G_inp = x[:, 0:max_len-1].clone()                       #input for generator should exclude last word of sequence
    # if train == False:
    #     return G_inp

    # r = np.random.rand(G_inp.size(0), G_inp.size(1))
    #                                                         #Perform word_dropout according to random values (r) generated for each word
    # for i in range(len(G_inp)):
    #     for j in range(1,G_inp.size(1)):
    #         if r[i, j] < opt.word_dropout and G_inp[i, j] not in [vocab.stoi[opt.pad_token], vocab.stoi[opt.end_token]]:
    #             G_inp[i, j] = vocab.stoi[opt.unk_token]

    return G_inp

def producebatches(x,batch_size):
    k = math.floor(x.shape[0]/batch_size)
    total = (k)*batch_size
    flatten = x[0:total].flatten()
    batches = flatten.reshape((k,batch_size,x.shape[1]))
    return batches

batches = producebatches(sequences_matrix, batch_size)      

model = VAE(embedding_dim, vocab_size,  n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size,n_layers_G)
optimizer = torch.optim.Adam(model.parameters(), lr = lr)
criterion = nn.BCELoss(reduction = 'sum')

model.train()

def train_batch(x,G_inp,step,train  =True):
    reconstruction, hidden, kld = model(x, G_inp, None, None)
    reconstruction2 = reconstruction.view(-1, vocab_size).clone()                       #converting into shape (batch_size*(n_seq-1), n_vocab) to facilitate performing F.cross_entropy()
    #y = x[:, 1:x.size(1)].clone()                                  #target for generator should exclude first word of sequence
    #y = y.contiguous().view(-1)      
    G_inp2 = G_inp.contiguous().view(-1)                              #converting into shape (batch_size*(n_seq-1),1) to facilitate performing F.cross_entropy()
    rec_loss = F.cross_entropy(reconstruction2,G_inp2)
    kld_coef = (math.tanh((step - 15000)/1000) + 1) / 2
    #kld_coef = min(1,step/(200000.0))
    loss = rec_coef*rec_loss + kld_coef*kld
    if train == True:      
        torch.autograd.set_detect_anomaly(True)                                 #skip below step if we are performing validation
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
    return rec_loss.item(), kld.item()


for epoch in range(epochs):
    train_rec_loss = []
    train_kl_loss = []

    for i in range(batches.shape[0]):   
        x = torch.tensor(batches[i], dtype = torch.long)
        G_inp = create_generator_input(x, train = True)
        rec_loss, kl_loss = train_batch(x,G_inp,epoch,train=True)
        train_rec_loss.append(rec_loss)
        train_kl_loss.append(kl_loss)
        
        
    train_rec_loss = np.mean(train_rec_loss)
    train_kl_loss = np.mean(train_kl_loss)
    print("No.", epoch, "T_rec:", '%.2f'%rec_loss, "T_kld:", '%.2f'%kl_loss)
 
def generate_sentences(n):

    model.eval()
    sentences = []
    
    for i in range(n):
        z = torch.randn([1, dim_z])
        hidden = (torch.zeros(n_layers_G,1, n_hidden_G)
                       ,torch.zeros(n_layers_G, 1, n_hidden_G))
        G_inp = torch.LongTensor(1,1).fill_(1)
        str_ind = []
        while len(str_ind)<49:
            with torch.autograd.no_grad():
                logit, G_hidden, _ = model(None, G_inp, z, hidden)
            
            probs = F.softmax(logit[0],dim=1)
            G_inp = torch.multinomial(probs,1)
            str_ind.append(G_inp[0][0].item())
        sentences.append(str_ind)
    return sentences
t = generate_sentences(1)

First, you can re-initialize your hidden states after each epoch. This will overcome the error you are facing without any major changes:


for epoch in range(epochs):
    train_rec_loss = []
    train_kl_loss = []

    for i in range(batches.shape[0]):   
        x = torch.tensor(batches[i], dtype = torch.long)
        G_inp = create_generator_input(x, train = True)
        rec_loss, kl_loss = train_batch(x,G_inp,epoch,train=True)
        train_rec_loss.append(rec_loss)
        train_kl_loss.append(kl_loss)
    
    # reset the hidden states so the next epoch does not reuse the old graph
    model.encoder.hidden = (torch.zeros(2 * n_layers_E, batch_size, n_hidden_E),
                            torch.zeros(2 * n_layers_E, batch_size, n_hidden_E))
    model.generator.hidden = (torch.zeros(n_layers_G, batch_size, n_hidden_G),
                              torch.zeros(n_layers_G, batch_size, n_hidden_G))
        
    train_rec_loss = np.mean(train_rec_loss)
    train_kl_loss = np.mean(train_kl_loss)
    print("No.", epoch, "T_rec:", '%.2f'%rec_loss, "T_kld:", '%.2f'%kl_loss)

Furthermore, you can avoid in-place operations on activation functions and dropout layers by setting inplace=False (although I do not think that is the issue in your case).
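
For example, if a model used the in-place versions of these layers, switching them off would look like this (a generic sketch with made-up layer sizes, not layers from the model above):

import torch.nn as nn

# in-place versions overwrite the input tensor's memory, which can invalidate
# values that autograd still needs; inplace=False (the default) keeps them intact
block = nn.Sequential(
    nn.Linear(10, 10),
    nn.ReLU(inplace=False),
    nn.Dropout(p=0.5, inplace=False),
)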
