Issues with building up an experience buffer in Pytorch due to "inplace operation"
Having mainly built reinforcement learning models in TensorFlow so far, I am now playing around with PyTorch. I ran into a problem when I tried to integrate an "experience buffer", a.k.a. "replay buffer", into code that otherwise ran fine. I have boiled my problem down to a simple working example and hope one of you can explain what I am missing or not fully understanding about PyTorch, and how I can solve it:
I use the OpenAI Gym CartPole environment to illustrate my problem.
# Load dependencies
import gym
import numpy as np
import torch as t
import torch.nn as nn
import torch.nn.functional as f
import collections

env = gym.make("CartPole-v1")  # Create the environment
buffer_cap = 100

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.dense1 = nn.Linear(4, 128)
        self.action = nn.Linear(128, 2)
        self.critic = nn.Linear(128, 1)

    def forward(self, x):
        x = self.dense1(x)
        x = f.relu(x)
        act = self.action(x)
        act = f.softmax(act, dim=0)
        crt = self.critic(x)
        return act, crt

    def ActorLoss(self, log_prob, ret, value):
        ret = ret.unsqueeze(0)
        diff = ret - value
        return -log_prob * diff

# Experience buffer
class ExpBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def append(self, experience):
        self.buffer.append(experience)

    def __len__(self):  # so that len() can be called on an instance
        return len(self.buffer)

    def sample(self, batch_size):
        # Select multiple experiences to form an uncorrelated batch
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        # zip(*zipped) unzips a zipped input
        log_probs, values, rewards = zip(*[self.buffer[idx] for idx in indices])
        return zip(log_probs, values, rewards)

class Worker():
    def __init__(self):
        self.net = Network()
        self.optimizer = t.optim.Adam(self.net.parameters(), lr=0.02)
        self.ExpBuffer = ExpBuffer(buffer_cap)
        self.Experience = collections.namedtuple('Experience', "log_probs value reward")

    def fillExperience(self):
        # Fill up the experience buffer until the cap is reached
        while len(self.ExpBuffer) < buffer_cap:
            action_probs_hist = []
            critic_value_hist = []
            rewards_hist = []
            state_next = env.reset()
            done = False
            while not done:
                state = t.tensor(state_next, dtype=t.float)  # convert numpy output to tensor
                action_probs, critic_value = self.net.forward(state)
                action = t.multinomial(action_probs, 1)  # select action with given probability
                state_next, reward, done, _ = env.step(int(action))  # next step
                action_probs_hist.append(action_probs[action])
                critic_value_hist.append(critic_value)
                rewards_hist.append(reward)
            # Fill up the experience buffer
            for log_prob, val, rew in zip(action_probs_hist, critic_value_hist, rewards_hist):
                exp = self.Experience(log_prob, val, rew)
                self.ExpBuffer.append(exp)

    def train(self):
        # Training in this example is nothing more than
        # sampling twice from the buffer and backpropagating
        for i in range(2):  # simply sample two times and backpropagate
            loss = []
            batch = self.ExpBuffer.sample(5)
            for log_prob, val, ret in batch:
                loss.append(self.net.ActorLoss(log_prob, val, ret))
            # Backpropagation steps
            loss_value = sum(loss)
            self.optimizer.zero_grad()
            loss_value.backward()
            self.optimizer.step()

if __name__ == "__main__":
    w = Worker()
    w.fillExperience()
    w.train()
So the steps are: fill the experience buffer completely, then sample from it twice and backpropagate each time. (In the actual code, the network would be updated and new experience created and stored along the way.)
During the second sampling round, when I reach loss.backward(), I get the following error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [128, 1]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I have already applied t.autograd.set_detect_anomaly(True) to at least get some more information, but it did not help. I tried to google the error and understand it, but I could not.
Could anyone shed some light on this issue? What is going on behind the scenes, and what can I do to solve it?
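In case it helps, here is the smallest standalone snippet I could come up with that throws what looks like the same error. This is my own reduction, separate from the code above, so I am not certain it shares the same root cause: two forward passes are recorded against the same weights, and optimizer.step() then updates those weights in place between the two backward() calls:

import torch

net = torch.nn.Sequential(
    torch.nn.Linear(4, 8),
    torch.nn.ReLU(),
    torch.nn.Linear(8, 1),
)
opt = torch.optim.SGD(net.parameters(), lr=0.1)

x = torch.randn(4)
out1 = net(x)    # graph 1 keeps references to the current weights
out2 = net(x)    # graph 2 references the very same weight tensors

out1.backward()  # fine: the weights are still at the saved version
opt.step()       # in-place parameter update -> version counters bump

out2.backward()  # RuntimeError: ... modified by an inplace operation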
Thanks!
KO4all, I ran into the same problem as you. My error came (I suppose, for now) from introducing an LSTM into an otherwise working continuous PPO algorithm. I have an operation just like the one in your code:
log_probs, values, rewards = zip(*[self.buffer[idx] for idx in indices])
In my case, I use a for-loop to compute the values continuously. But I do not know how to fix it. I hope someone can help you; maybe that solution will work for me as well.
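For what it is worth, the closest thing to a fix I have come across (I have not been able to verify it, and it did not map cleanly onto my LSTM case, which is why my problem is still open) is to not keep tensors that still carry the autograd graph in the buffer at all. Instead, store plain transitions (state, action, reward) and rerun the network on the sampled states inside train(), so that every backward() works on a graph built from the current weights. A rough sketch against your code, assuming the Experience tuple and fillExperience() are changed to store state, action and reward:

    def train(self):
        for i in range(2):
            batch = self.ExpBuffer.sample(5)  # now yields (state, action, reward) triples
            loss = []
            for state, action, ret in batch:
                # Fresh forward pass -> graph built from the current weights
                action_probs, value = self.net(t.tensor(state, dtype=t.float))
                log_prob = t.log(action_probs[action])
                loss.append(self.net.ActorLoss(log_prob, t.tensor(ret, dtype=t.float), value))
            loss_value = sum(loss)
            self.optimizer.zero_grad()
            loss_value.backward()
            self.optimizer.step()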