[英]Unable to figure out the In place operation error in pytorch
I have written the following code and I cannot seem to find where is the in place operation error.我已经编写了以下代码,但我似乎无法找到就地操作错误在哪里。
The error that I am getting is我得到的错误是
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor []], which is output 0 of AsStridedBackward0, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
After setting anomaly detection, I am being pointed to this snippet of code设置异常检测后,我被指向这段代码
def create_S_n(self, n) -> torch.Tensor:
'''Create the S_n matrix'''
v = self.id_tensor_list(self.d)
S_n = torch.zeros(self.d, self.d)
for i in range(self.d):
S_n = S_n + (torch.mul(torch.diag(v[i]), torch.exp(self.phi[i][2] * torch.log(torch.exp(self.phi[i][3]) + self.Y[n][i]))))
return S_n
and the line和线
S_n = S_n + (torch.mul(torch.diag(v[i]), torch.exp(self.phi[i][2] * torch.log(torch.exp(self.phi[i][3]) + self.Y[n][i]))))
Can someone tell me where is the in place operation?有人可以告诉我就地操作在哪里吗? and how do I overcome it?
我该如何克服它?
Here is my entire code for reproducability:这是我的整个可重现性代码:
import random
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import scipy.stats as stats
import torch
from torch import tensor
from torch import diag
from torch import Size
from torch import inverse
import torch.optim as optim
torch.autograd.set_detect_anomaly(True)
class Count_SVI:
'''A class that implements SVI'''
def __init__(self, precision = 4, dtype=torch.float32):
'''Initialize the model'''
self.N = 0 # Number of data points
self.Q = 0 # Number of features
self.d = 0 # Dimension of the response vector
self.K = 0 # Number of Latent factors
self.X = None # Data matrix, X is NxQ matrix
self.mu_true = None # True mean of the response vector, mu is dx1 vector
self.L_true = None # True L matrix, L is dxK matrix
self.D_true = None # True D matrix, D is dxd matrix
self.B_true = None # True B matrix, the matrix of regression weights, B is dxQ matrix
self.sigma_true = None # True sigma matrix, sigma is a dxd matrix
self.A_true = None # True A matrix, the matrix of cholesky decomposition, A is dxd matrix
self.Y = None # Response vector, Y is Nx1 matrix
self.mu = None # Mean of the response vector, mu is dx1 vector
self.L = None # L matrix, L is dxK matrix
self.D = None # D matrix, D is dxd matrix
self.B = None # B matrix, the matrix of regression weights, B is dxQ matrix
self.sigma = None # sigma matrix, sigma is a dxd matrix
self.A = None # A matrix, the matrix of cholesky decomposition, A is dxd matrix
self.phi = None # phi matrix, phi is 4xd matrix
#self.M = None # M is Nxd matrix with each row corresponding to each n storing the d dimensional vector m_n
#self.S = None # S is Nxdxd matrix with each row corresponding to each n storing the dxd matrix S_n
#self.mu_n = None # mu_n matrix, mu_n is N x d matrix with each row corresponding to each n storing the d dimensional vector mu_n
#self.sigma_n = None # sigma_n matrix, sigma_n is Nxdxd matrix with each row corresponding to each n storing the dxd matrix sigma_n
self.dtype = dtype
self.seed = np.random.randint(0, 100)
torch.set_printoptions(precision=precision, profile='full', sci_mode=False)
def optimizable_parameters(self) -> torch.Tensor:
'''Return the optimizable parameters'''
yield self.mu
yield self.L
yield self.D
yield self.B
yield self.phi
def generate_data(self, N, Q, d, K, X, mu_true, L_true, D_true, B_true) -> None:
'''Generate data according to the Poisson-Lognormal distribution'''
self.N = N # Number of data points
self.Q = Q # Number of features
self.d = d # Dimension of the response vector
self.K = K # Number of Latent factors
self.X = X.clone().detach().to(self.dtype) # Data matrix, X is NxQ matrix
self.mu_true = mu_true.clone().detach().to(self.dtype) # True mean of the response vector, mu is dx1 vector
self.L_true = L_true.clone().detach().to(self.dtype) # True L matrix, L is dxK matrix
self.D_true = D_true.clone().detach().to(self.dtype) # True D matrix, D is dxd matrix
self.B_true = B_true.clone().detach().to(self.dtype) # True B matrix, the matrix of regression weights, B is dxQ matrix
assert self.X.shape == Size([self.N, self.Q])
assert self.mu_true.shape == Size([self.d, ])
assert self.L_true.shape == Size([self.d, self.K])
assert self.D_true.shape == Size([self.d, self.d])
assert self.B_true.shape == Size([self.d, self.Q])
# sigma_true is semidefinite positive and D_true is a diagonal matrix
# sigma = LL^T + D
self.sigma_true = (L_true @ torch.t(L_true)) + D_true # sigma is a dxd matrix
# Generate random matrices accoriding to eq 7
self.A_true = torch.linalg.cholesky(self.sigma_true)
X_t = torch.t(X)
Y = []
for i in range(self.N):
z_i = torch.randn((self.d, ), dtype=self.dtype)
# generate Poisson-Lognormal data
mu_yi = torch.exp(self.mu_true + (self.B_true @ (X_t[:, i])) + (self.A_true @ z_i))
y = torch.poisson(mu_yi).detach().numpy()
Y.append(y)
Y = torch.from_numpy(np.vstack(Y)).to(self.dtype)
self.Y = Y.clone().detach() # Response vector, Y is Nxd matrix
def set_params(self) -> None:
'''Set the parameters'''
print("Setting the parameters ...\n")
self.mu = torch.randn((self.d, ), dtype=self.dtype, requires_grad=True)
self.L = torch.randn((self.d, self.K), dtype=self.dtype, requires_grad=True)
self.D = diag(diag(torch.randint(1, self.d,(self.d, self.d), dtype=self.dtype))).requires_grad_()
#self.sigma = (self.L @ torch.t(self.L) + self.D)
#self.A = torch.linalg.cholesky(self.sigma)
self.B = torch.randn((self.d, self.Q), dtype=self.dtype, requires_grad=True)
self.phi = torch.randn((self.d, 4), dtype=self.dtype, requires_grad=True) # phi is a dx4 matrix
def set_train(self, epochs = 10, batch_size=20, max_iter=1000, tol=1e-5, learning_rate=0.01) -> None:
'''Initialize variables for monitoring of the training'''
print("\nSetting up for training ...\n")
self.epochs = epochs
self.batch_size = batch_size
self.n_iter = 0
self.converged = False
self.max_iter = max_iter
self.tol = tol
self.learning_rate = learning_rate
self.elbo = torch.zeros(1, dtype=self.dtype)
self.set_params()
def check_convergence(self) -> bool:
'''Check if the model has converged'''
# Check if the parameters have converged
if self.n_iter > 1:
if torch.norm(self.mu_history[-1] - self.mu_history[-2]) < self.tol and torch.norm(self.B_history[-1] - self.B_history[-2]) < self.tol and torch.norm(self.L_history[-1] - self.L_history[-2]) < self.tol and torch.norm(self.D_history[-1] - self.D_history[-2]) < self.tol:
print(f"\nThe parameters have converged succesfully!\n")
self.converged = True
return True
else:
return False
else:
return False
def sample_zn(self, n) -> torch.Tensor:
'''Sample from z_n where z_n ~ N(mu_n, sigma_n)
where mu_n is a dx1 vector and sigma_n is a dxd matrix
sigma_n = (S_n + sigma^{-1})^{-1}
m_n = sigma_n^{-1} * (S_n * m_n + sigma^{-1} * (mu + B*x_n))'''
eps0 = torch.distributions.MultivariateNormal(torch.zeros(self.d), torch.eye(self.d)).sample().to(self.dtype)
eps1 = torch.distributions.MultivariateNormal(torch.zeros(self.K), torch.eye(self.K)).sample().to(self.dtype)
eps2 = torch.distributions.MultivariateNormal(torch.zeros(self.d), torch.eye(self.d)).sample().to(self.dtype)
eps1_prime = torch.mv(self.L, eps1) + torch.mv(self.D, eps0) # Normally distributed with mean 0 and covariance sigma
eps2_prime = torch.mv(diag(torch.reciprocal(torch.sqrt(diag(self.create_S_n(n))))), eps2) # Normally distributed with mean 0 and S_n^{-1}
zn_s = self.mu_n_func(n) + eps1_prime - (self.L @ torch.t(self.L) + self.D) @ inverse(inverse(self.create_S_n(n)) + (self.L @ torch.t(self.L) + self.D)) @ (eps1_prime + eps2_prime)
return zn_s # zn_s is a dx1 vector
def id_tensor_list(self, l):
'''Function that creates a list of tensors of the form
[ [1,0,0,0..], [0,1,0,0..], [0,0,1,0..], [0,0,1,0..], ... [0,0,0,..1] ]'''
id_tensor_list = []
for i in range(l):
v = torch.zeros(self.d)
v[i] = 1
id_tensor_list.append(v)
return id_tensor_list
def create_m_n(self, n) -> torch.Tensor:
'''Create the m_n vector'''
v = self.id_tensor_list(self.d)
m_n = torch.zeros(self.d, )
for i in range(self.d):
m_n = m_n + (torch.mul(v[i], self.phi[i][0] * torch.log(torch.exp(self.phi[i][1]) + self.Y[n][i])))
return m_n
def create_S_n(self, n) -> torch.Tensor:
'''Create the S_n matrix'''
v = self.id_tensor_list(self.d)
S_n = torch.zeros(self.d, self.d)
for i in range(self.d):
S_n = S_n + (torch.mul(torch.diag(v[i]), torch.exp(self.phi[i][2] * torch.log(torch.exp(self.phi[i][3]) + self.Y[n][i]))))
return S_n
def sigma_n_func(self, n) -> torch.Tensor:
'''Compute Sigma_n'''
return inverse(self.create_S_n(n) + inverse((self.L @ torch.t(self.L) + self.D)))
def mu_n_func(self, n) -> torch.Tensor:
return inverse(self.sigma_n_func(n)) @ (self.create_S_n(n) @ self.create_m_n(n) + inverse((self.L @ torch.t(self.L) + self.D)) @ (self.mu + self.B @ self.X[n]))
def poisson_log_likelihood(self, y, z) -> torch.Tensor:
'''Compute the log likelihood of the data given the poisson model'''
# Compute the log likelihood of the data y given the poisson model with the mean parameter as exp^z
# returns yz - exp(z) - log(y!), we skip the log(y!) term since it is constant
return torch.mul(y, z) - torch.exp(z)
def normal_log_likelihood(self, y, mu, sigma) -> torch.Tensor:
'''Compute the log likelihood of the data given the normal model'''
# Compute the log likelihood of the data y given the normal model with the mean parameter as mu and covariance as sigma
# returns -0.5 * ((y - mu) / sigma)^2 - 0.5 * log(2 * pi * sigma)
return -0.5 * torch.square(torch.div((y - mu) , sigma)) - torch.log(sigma)
def get_elbo(self) -> torch.Tensor:
'''Compute the evidence lower bound'''
# Compute the evidence lower bound
for n in range(self.N):
z_n = self.sample_zn(n)
for s in range(self.batch_size):
for i in range(self.d):
self.elbo = self.elbo + self.poisson_log_likelihood(self.Y[n][i], z_n[i]) - self.normal_log_likelihood(z_n[i], self.mu_n_func(n)[i], torch.reciprocal(self.create_S_n(n)[i][i]))
self.elbo /= self.batch_size
return self.elbo
def train(self) -> None:
'''Start training the model'''
optimizer = optim.Adam(self.optimizable_parameters(), lr=self.learning_rate)
print("Starting training: \n")
while self.n_iter < 20:#self.max_iter:# and not self.converged:
print("Iteration: " + str(self.n_iter))
self.n_iter += 1
optimizer.zero_grad()
overall_loss = -self.get_elbo()
overall_loss.sum().backward(retain_graph=True)
optimizer.step()
self.converged = self.check_convergence()
if __name__ == '__main__':
# Reproducability
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)
# Set the model parameters
N = 10
Q = 4
d = 3
K = 2
X = torch.randn(N, Q)
mu = torch.randn((d, ))
L = torch.randn((d, K))
D = diag(diag(torch.randint(1, d, (d, d))))
B = torch.randn((d, Q))
# Create the model class
model = Count_SVI()
model.generate_data(N, Q, d, K, X, mu, L, D, B)
model.set_train()
model.train()
The operations like S_n = S_n + something...
are called in-place operations that are not allowed while calculating the gradients using the autograd function of pytorch.像
S_n = S_n + something...
这样的操作称为就地操作,在使用 pytorch 的 autograd function 计算梯度时,这些操作是不允许的。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.