
Unable to figure out the In place operation error in pytorch

I have written the following code and I cannot seem to find where the in-place operation error is.

The error that I am getting is

Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor []], which is output 0 of AsStridedBackward0, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
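
To make sure I am reading the message correctly, here is a minimal, self-contained example (unrelated to my model) that raises the same class of error: a tensor that autograd saved for the backward pass is modified in place afterwards, so its version counter no longer matches what autograd recorded.

import torch

x = torch.randn(3, requires_grad=True)
h = x + 1            # non-leaf intermediate tensor
y = (h ** 2).sum()   # pow saves h for its backward pass
h += 1               # in-place update bumps h's version counter to 1
y.backward()         # RuntimeError: ... modified by an inplace operation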

After setting anomaly detection, I am pointed to this snippet of code

def create_S_n(self, n) -> torch.Tensor:
    '''Create the S_n matrix'''
    v = self.id_tensor_list(self.d)
    S_n = torch.zeros(self.d, self.d)
    for i in range(self.d):
        S_n = S_n + (torch.mul(torch.diag(v[i]), torch.exp(self.phi[i][2] * torch.log(torch.exp(self.phi[i][3]) + self.Y[n][i]))))
    return S_n

and the line

S_n = S_n + (torch.mul(torch.diag(v[i]), torch.exp(self.phi[i][2] * torch.log(torch.exp(self.phi[i][3]) + self.Y[n][i]))))

Can someone tell me where the in-place operation is, and how do I overcome it?

Here is my entire code for reproducibility:

import random
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import scipy.stats as stats
import torch
from torch import tensor
from torch import diag
from torch import Size
from torch import inverse
import torch.optim as optim
torch.autograd.set_detect_anomaly(True)

class Count_SVI:
    '''A class that implements SVI'''
    def __init__(self, precision = 4, dtype=torch.float32):
        '''Initialize the model'''
        self.N = 0 # Number of data points
        self.Q = 0 # Number of features
        self.d = 0 # Dimension of the response vector
        self.K = 0 # Number of Latent factors
        self.X = None # Data matrix, X is NxQ matrix

        self.mu_true = None # True mean of the response vector, mu is dx1 vector
        self.L_true  = None # True L matrix, L is dxK matrix
        self.D_true  = None # True D matrix, D is dxd matrix
        self.B_true  = None # True B matrix, the matrix of regression weights, B is dxQ matrix
        self.sigma_true = None # True sigma matrix, sigma is a dxd matrix
        self.A_true = None # True A matrix, the matrix of cholesky decomposition, A is dxd matrix
        self.Y = None # Response vector, Y is Nxd matrix

        self.mu = None # Mean of the response vector, mu is dx1 vector
        self.L = None # L matrix, L is dxK matrix
        self.D = None # D matrix, D is dxd matrix
        self.B = None # B matrix, the matrix of regression weights, B is dxQ matrix
        self.sigma = None # sigma matrix, sigma is a dxd matrix
        self.A = None # A matrix, the matrix of cholesky decomposition, A is dxd matrix

        self.phi = None # phi matrix, phi is 4xd matrix
        #self.M = None # M is Nxd matrix with each row corresponding to each n storing the d dimensional vector m_n 
        #self.S = None # S is Nxdxd matrix with each row corresponding to each n storing the dxd matrix S_n
        #self.mu_n = None # mu_n matrix, mu_n is N x d matrix with each row corresponding to each n storing the d dimensional vector mu_n
        #self.sigma_n = None # sigma_n matrix, sigma_n is Nxdxd matrix with each row corresponding to each n storing the dxd matrix sigma_n

        self.dtype = dtype
        self.seed = np.random.randint(0, 100)
        torch.set_printoptions(precision=precision, profile='full', sci_mode=False)

    def optimizable_parameters(self) -> torch.Tensor:
        '''Return the optimizable parameters'''
        yield self.mu
        yield self.L
        yield self.D
        yield self.B
        yield self.phi

    def generate_data(self, N, Q, d, K, X, mu_true, L_true, D_true, B_true) -> None:
        '''Generate data according to the Poisson-Lognormal distribution'''
        self.N = N # Number of data points
        self.Q = Q # Number of features
        self.d = d # Dimension of the response vector
        self.K = K # Number of Latent factors
        self.X = X.clone().detach().to(self.dtype) # Data matrix, X is NxQ matrix
        self.mu_true = mu_true.clone().detach().to(self.dtype) # True mean of the response vector, mu is dx1 vector
        self.L_true  = L_true.clone().detach().to(self.dtype) # True L matrix, L is dxK matrix
        self.D_true  = D_true.clone().detach().to(self.dtype) # True D matrix, D is dxd matrix
        self.B_true  = B_true.clone().detach().to(self.dtype) # True B matrix, the matrix of regression weights, B is dxQ matrix
        assert self.X.shape       == Size([self.N, self.Q])
        assert self.mu_true.shape == Size([self.d, ])
        assert self.L_true.shape  == Size([self.d, self.K])
        assert self.D_true.shape  == Size([self.d, self.d])
        assert self.B_true.shape  == Size([self.d, self.Q]) 
        # sigma_true is semidefinite positive and D_true is a diagonal matrix
        # sigma = LL^T + D
        self.sigma_true = (L_true @ torch.t(L_true)) + D_true # sigma is a dxd matrix
        # Generate random matrices according to eq 7
        self.A_true = torch.linalg.cholesky(self.sigma_true)
        X_t = torch.t(X)
        Y = []
        for i in range(self.N):
            z_i = torch.randn((self.d, ), dtype=self.dtype)
            # generate Poisson-Lognormal data
            mu_yi = torch.exp(self.mu_true + (self.B_true @ (X_t[:, i])) + (self.A_true @ z_i))
            y = torch.poisson(mu_yi).detach().numpy()
            Y.append(y)
        Y = torch.from_numpy(np.vstack(Y)).to(self.dtype)
        self.Y = Y.clone().detach() # Response vector, Y is Nxd matrix

    def set_params(self) -> None:
        '''Set the parameters'''
        print("Setting the parameters ...\n")
        self.mu = torch.randn((self.d, ), dtype=self.dtype, requires_grad=True)
        self.L = torch.randn((self.d, self.K), dtype=self.dtype, requires_grad=True)
        self.D = diag(diag(torch.randint(1, self.d,(self.d, self.d), dtype=self.dtype))).requires_grad_()
        #self.sigma = (self.L @ torch.t(self.L) + self.D)
        #self.A = torch.linalg.cholesky(self.sigma)
        self.B = torch.randn((self.d, self.Q), dtype=self.dtype, requires_grad=True)
        self.phi = torch.randn((self.d, 4), dtype=self.dtype, requires_grad=True) # phi is a dx4 matrix

    def set_train(self, epochs = 10, batch_size=20, max_iter=1000, tol=1e-5, learning_rate=0.01) -> None:
        '''Initialize variables for monitoring of the training'''
        print("\nSetting up for training ...\n")
        self.epochs = epochs
        self.batch_size = batch_size
        self.n_iter = 0
        self.converged = False
        self.max_iter = max_iter
        self.tol = tol
        self.learning_rate = learning_rate
        self.elbo = torch.zeros(1, dtype=self.dtype)
        self.set_params()

    def check_convergence(self) -> bool:
        '''Check if the model has converged'''
        # Check if the parameters have converged
        if self.n_iter > 1:
            if torch.norm(self.mu_history[-1] - self.mu_history[-2]) < self.tol and torch.norm(self.B_history[-1] - self.B_history[-2]) < self.tol and torch.norm(self.L_history[-1] - self.L_history[-2]) < self.tol and torch.norm(self.D_history[-1] - self.D_history[-2]) < self.tol:
                print(f"\nThe parameters have converged succesfully!\n")
                self.converged = True
                return True
            else:
                return False
        else:
            return False

    def sample_zn(self, n) -> torch.Tensor:
        '''Sample from z_n where z_n ~ N(mu_n, sigma_n)
        where mu_n is a dx1 vector and sigma_n is a dxd matrix
        sigma_n = (S_n + sigma^{-1})^{-1}
        m_n = sigma_n^{-1} * (S_n * m_n + sigma^{-1} * (mu + B*x_n))'''
        eps0 = torch.distributions.MultivariateNormal(torch.zeros(self.d), torch.eye(self.d)).sample().to(self.dtype)
        eps1 = torch.distributions.MultivariateNormal(torch.zeros(self.K), torch.eye(self.K)).sample().to(self.dtype)
        eps2 = torch.distributions.MultivariateNormal(torch.zeros(self.d), torch.eye(self.d)).sample().to(self.dtype)
        eps1_prime = torch.mv(self.L, eps1) + torch.mv(self.D, eps0) # Normally distributed with mean 0 and covariance sigma
        eps2_prime = torch.mv(diag(torch.reciprocal(torch.sqrt(diag(self.create_S_n(n))))), eps2) # Normally distributed with mean 0 and S_n^{-1}
        zn_s = self.mu_n_func(n) + eps1_prime - (self.L @ torch.t(self.L) + self.D) @ inverse(inverse(self.create_S_n(n)) + (self.L @ torch.t(self.L) + self.D)) @ (eps1_prime + eps2_prime)
        return zn_s # zn_s is a dx1 vector

    def id_tensor_list(self, l):
        '''Function that creates a list of tensors of the form 
        [ [1,0,0,0..], [0,1,0,0..], [0,0,1,0..], [0,0,0,1..], ... [0,0,0,..1] ]'''
        id_tensor_list = []
        for i in range(l):
            v = torch.zeros(self.d)
            v[i] = 1
            id_tensor_list.append(v)
        return id_tensor_list
    
    def create_m_n(self, n) -> torch.Tensor:
        '''Create the m_n vector'''
        v = self.id_tensor_list(self.d)
        m_n = torch.zeros(self.d, )
        for i in range(self.d):
            m_n = m_n + (torch.mul(v[i], self.phi[i][0] * torch.log(torch.exp(self.phi[i][1]) + self.Y[n][i])))
        return m_n

    def create_S_n(self, n) -> torch.Tensor:
        '''Create the S_n matrix'''
        v = self.id_tensor_list(self.d)
        S_n = torch.zeros(self.d, self.d)
        for i in range(self.d):
            S_n = S_n + (torch.mul(torch.diag(v[i]), torch.exp(self.phi[i][2] * torch.log(torch.exp(self.phi[i][3]) + self.Y[n][i]))))
        return S_n
    
    def sigma_n_func(self, n) -> torch.Tensor:
        '''Compute Sigma_n'''
        return inverse(self.create_S_n(n) + inverse((self.L @ torch.t(self.L) + self.D)))
    
    def mu_n_func(self, n) -> torch.Tensor:
        return inverse(self.sigma_n_func(n)) @ (self.create_S_n(n) @ self.create_m_n(n) + inverse((self.L @ torch.t(self.L) + self.D)) @ (self.mu + self.B @ self.X[n]))

    def poisson_log_likelihood(self, y, z) -> torch.Tensor:
        '''Compute the log likelihood of the data given the poisson model'''
        # Compute the log likelihood of the data y given the poisson model with the mean parameter as exp^z
        # returns yz - exp(z) - log(y!), we skip the log(y!) term since it is constant 
        return torch.mul(y, z) - torch.exp(z)
    
    def normal_log_likelihood(self, y, mu, sigma) -> torch.Tensor:
        '''Compute the log likelihood of the data given the normal model'''
        # Compute the log likelihood of the data y given the normal model with the mean parameter as mu and covariance as sigma
        # returns -0.5 * ((y - mu) / sigma)^2 - 0.5 * log(2 * pi * sigma)
        return -0.5 * torch.square(torch.div((y - mu) , sigma)) - torch.log(sigma)

    def get_elbo(self) -> torch.Tensor:
        '''Compute the evidence lower bound'''
        # Compute the evidence lower bound
        for n in range(self.N):
            z_n = self.sample_zn(n)
            for s in range(self.batch_size):
                for i in range(self.d):
                    self.elbo = self.elbo + self.poisson_log_likelihood(self.Y[n][i], z_n[i]) - self.normal_log_likelihood(z_n[i], self.mu_n_func(n)[i], torch.reciprocal(self.create_S_n(n)[i][i]))
            self.elbo /= self.batch_size
        return self.elbo
    
    def train(self) -> None:
        '''Start training the model'''
        optimizer = optim.Adam(self.optimizable_parameters(), lr=self.learning_rate)
        print("Starting training: \n")
        while self.n_iter < 20:#self.max_iter:# and not self.converged:
            print("Iteration: " + str(self.n_iter))
            self.n_iter += 1
            optimizer.zero_grad()
            overall_loss = -self.get_elbo()
            overall_loss.sum().backward(retain_graph=True)
            optimizer.step()
            self.converged = self.check_convergence()

if __name__ == '__main__':
    # Reproducibility
    torch.manual_seed(0)
    random.seed(0)
    np.random.seed(0)

    # Set the model parameters
    N = 10
    Q = 4
    d = 3
    K = 2
    X = torch.randn(N, Q)
    mu = torch.randn((d, ))
    L = torch.randn((d, K))
    D = diag(diag(torch.randint(1, d, (d, d))))
    B = torch.randn((d, Q))

    # Create the model class
    model = Count_SVI()
    model.generate_data(N, Q, d, K, X, mu, L, D, B)
    model.set_train()
    model.train()

Strictly speaking, S_n = S_n + something is not an in-place operation: the right-hand side builds a new tensor and the name S_n is merely rebound to it. In-place operations are the augmented assignments (+=, /=) and the underscore methods (add_(), mul_(), and so on), which overwrite a tensor's storage and bump its version counter. Autograd raises this error when a tensor it saved for the backward pass has been modified that way; the line reported by anomaly detection is only where the gradient could no longer be computed, not necessarily where the in-place write happened.
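
In the posted code the more likely culprits are (a) self.elbo, which is created once in set_train and then kept alive across training iterations, so every new backward() also walks back through graphs whose parameters optimizer.step() has already updated in place (this is also why retain_graph=True is needed at all), and (b) the in-place division self.elbo /= self.batch_size. One way to restructure get_elbo and train, keeping the original arithmetic but building the ELBO in a fresh local tensor on every call, could look like the following sketch (untested, written against the posted class):

def get_elbo(self) -> torch.Tensor:
    '''Compute the evidence lower bound in a fresh tensor on each call'''
    elbo = torch.zeros(1, dtype=self.dtype)  # local accumulator, so no stale graph is carried over
    for n in range(self.N):
        z_n = self.sample_zn(n)
        for s in range(self.batch_size):
            for i in range(self.d):
                elbo = elbo + self.poisson_log_likelihood(self.Y[n][i], z_n[i]) \
                            - self.normal_log_likelihood(z_n[i], self.mu_n_func(n)[i],
                                                         torch.reciprocal(self.create_S_n(n)[i][i]))
        elbo = elbo / self.batch_size  # out-of-place division instead of /=
    return elbo

def train(self) -> None:
    '''Start training the model'''
    optimizer = optim.Adam(self.optimizable_parameters(), lr=self.learning_rate)
    print("Starting training: \n")
    while self.n_iter < self.max_iter and not self.converged:
        print("Iteration: " + str(self.n_iter))
        self.n_iter += 1
        optimizer.zero_grad()
        loss = -self.get_elbo()
        loss.sum().backward()  # no retain_graph: a new graph is built every iteration
        optimizer.step()
        self.converged = self.check_convergence()

With the ELBO rebuilt from scratch each iteration, backward() only has to traverse the current graph, so retain_graph=True (and the version mismatch with the parameters that optimizer.step() has already modified in place) goes away.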
