Problem with output of neural network in a cross-entropy method attempt at solving CartPole-v0
I am trying to implement the cross-entropy policy-based method on the classic CartPole-v0 environment. I am adapting a working implementation of this algorithm from MountainCarContinuous-v0, but when I try to get the agent learning, I get this error message:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
in
4
5 agent = Agent(env)
----> 6 scores = agent.learn()
7
8 # plot the scores
~/cross_entropy.py in learn(self, n_iterations, max_t, gamma, print_every, pop_size, elite_frac, sigma)
83 for i_iteration in range(1, n_iterations+1): # loop over all the training iterations
84 weights_pop = [best_weight + (sigma*np.random.randn(self.get_weights_dim())) for i in range(pop_size)] # population of the weights/policies
---> 85 rewards = np.array([self.evaluate(weights, gamma, max_t) for weights in weights_pop]) # rewards from the policies resulting from all individual weights
86
87 # get the best policies
~/cross_entropy.py in <listcomp>(.0)
83 for i_iteration in range(1, n_iterations+1): # loop over all the training iterations
84 weights_pop = [best_weight + (sigma*np.random.randn(self.get_weights_dim())) for i in range(pop_size)] # population of the weights/policies
---> 85 rewards = np.array([self.evaluate(weights, gamma, max_t) for weights in weights_pop]) # rewards from the policies resulting from all individual weights
86
87 # get the best policies
~/cross_entropy.py in evaluate(self, weights, gamma, max_t)
56 action = self.forward(state)
57 #action = torch.argmax(action_vals).item()
---> 58 state, reward, done, _ = self.env.step(action)
59 episode_return += reward * math.pow(gamma, t)
60 if done:
/gym/wrappers/time_limit.py in step(self, action)
14 def step(self, action):
15 assert self._elapsed_steps is not None, "Cannot call env.step() before calling reset()"
---> 16 observation, reward, done, info = self.env.step(action)
17 self._elapsed_steps += 1
18 if self._elapsed_steps >= self._max_episode_steps:
/gym/envs/classic_control/cartpole.py in step(self, action)
102 def step(self, action):
103 err_msg = "%r (%s) invalid" % (action, type(action))
--> 104 assert self.action_space.contains(action), err_msg
105
106 x, x_dot, theta, theta_dot = self.state
AssertionError: tensor([ 0.3987, 0.6013]) (<class 'torch.Tensor'>) invalid
I found this is because the MountainCarContinuous-v0 environment has a continuous Box action space, whereas CartPole-v0's is Discrete(2), meaning the environment only accepts an integer (0 or 1) as the action.
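For reference, the mismatch is visible just by printing the two action spaces (a quick check, assuming the standard gym API):

import gym

print(gym.make('MountainCarContinuous-v0').action_space)  # Box(...)    -> step() expects a float array
print(gym.make('CartPole-v0').action_space)               # Discrete(2) -> step() expects a plain int, 0 or 1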
I have tried to work around this by applying a softmax activation function and then taking the index of the larger output as the action:
action_vals = self.forward(state)
action = torch.argmax(action_vals).item()
This gets rid of the error, but when I train the agent it seems to learn incredibly fast, which is a sign that something is wrong. This is my full agent class:
class Agent(nn.Module):
    def __init__(self, env, h_size=16):
        super().__init__()
        self.env = env
        # state, hidden layer, action sizes
        self.s_size = env.observation_space.shape[0]
        self.h_size = h_size
        self.a_size = env.action_space.n
        # define layers
        self.fc1 = nn.Linear(self.s_size, self.h_size)
        self.fc2 = nn.Linear(self.h_size, self.a_size)
        self.device = torch.device('cpu')

    def set_weights(self, weights):
        s_size = self.s_size
        h_size = self.h_size
        a_size = self.a_size
        # separate the weights for each layer
        fc1_end = (s_size*h_size)+h_size
        fc1_W = torch.from_numpy(weights[:s_size*h_size].reshape(s_size, h_size))
        fc1_b = torch.from_numpy(weights[s_size*h_size:fc1_end])
        fc2_W = torch.from_numpy(weights[fc1_end:fc1_end+(h_size*a_size)].reshape(h_size, a_size))
        fc2_b = torch.from_numpy(weights[fc1_end+(h_size*a_size):])
        # set the weights for each layer
        self.fc1.weight.data.copy_(fc1_W.view_as(self.fc1.weight.data))
        self.fc1.bias.data.copy_(fc1_b.view_as(self.fc1.bias.data))
        self.fc2.weight.data.copy_(fc2_W.view_as(self.fc2.weight.data))
        self.fc2.bias.data.copy_(fc2_b.view_as(self.fc2.bias.data))

    def get_weights_dim(self):
        return (self.s_size+1)*self.h_size + (self.h_size+1)*self.a_size

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x))
        return x

    def evaluate(self, weights, gamma=1.0, max_t=5000):
        self.set_weights(weights)
        episode_return = 0.0
        state = self.env.reset()
        for t in range(max_t):
            state = torch.from_numpy(state).float().to(self.device)
            action_vals = self.forward(state)
            action = torch.argmax(action_vals).item()
            state, reward, done, _ = self.env.step(action)
            episode_return += reward * math.pow(gamma, t)
            if done:
                break
        return episode_return

    def learn(self, n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, elite_frac=0.2, sigma=0.5):
        """PyTorch implementation of the cross-entropy method.

        Params
        ======
            n_iterations (int): maximum number of training iterations
            max_t (int): maximum number of timesteps per episode
            gamma (float): discount rate
            print_every (int): how often to print average score (over last 100 episodes)
            pop_size (int): size of population at each iteration
            elite_frac (float): percentage of top performers to use in update
            sigma (float): standard deviation of additive noise
        """
        n_elite = int(pop_size*elite_frac)                            # number of elite policies kept from the population
        scores_deque = deque(maxlen=100)                              # scores of the last 100 iterations
        scores = []                                                   # list of all the scores
        best_weight = sigma*np.random.randn(self.get_weights_dim())   # initialize the first best weight randomly

        for i_iteration in range(1, n_iterations+1):  # loop over all the training iterations
            # population of the weights/policies
            weights_pop = [best_weight + (sigma*np.random.randn(self.get_weights_dim())) for i in range(pop_size)]
            # rewards from the policies resulting from all individual weights
            rewards = np.array([self.evaluate(weights, gamma, max_t) for weights in weights_pop])

            # get the best policies
            elite_idxs = rewards.argsort()[-n_elite:]
            elite_weights = [weights_pop[i] for i in elite_idxs]

            best_weight = np.array(elite_weights).mean(axis=0)  # take the average of the elite weights
            reward = self.evaluate(best_weight, gamma=1.0)      # evaluate this new policy
            scores_deque.append(reward)                         # append the reward
            scores.append(reward)                               # also append the reward

            torch.save(self.state_dict(), 'checkpoint.pth')     # save the agent

            if i_iteration % print_every == 0:                  # print every `print_every` iterations
                print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, np.mean(scores_deque)))

            if np.mean(scores_deque) >= 195.0:                  # environment is solved
                print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(i_iteration-100, np.mean(scores_deque)))
                break

        return scores
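For context, the agent is created and trained the same way it appears in the traceback above (this snippet is reconstructed from that traceback, assuming env is the CartPole-v0 environment):

import gym

env = gym.make('CartPole-v0')
agent = Agent(env)
scores = agent.learn()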
If anyone has an idea of how to get the agent training properly, any suggestions are welcome.
Turns out all I needed was to add an act() method to the Agent class:
def act(self, state):
    state = state.unsqueeze(0)
    probs = self.forward(state).cpu()
    m = Categorical(probs)   # requires: from torch.distributions import Categorical
    action = m.sample()      # sample an action from the policy distribution instead of taking the argmax
    return action.item()     # .item() returns a plain Python int, which Discrete(2) accepts
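With that in place, evaluate() calls act() instead of taking the argmax of the forward pass. A minimal sketch of the updated method (the exact version is not shown above; this assumes the rest of the class stays the same):

def evaluate(self, weights, gamma=1.0, max_t=5000):
    self.set_weights(weights)
    episode_return = 0.0
    state = self.env.reset()
    for t in range(max_t):
        state = torch.from_numpy(state).float().to(self.device)
        action = self.act(state)                        # sampled int action, valid for Discrete(2)
        state, reward, done, _ = self.env.step(action)
        episode_return += reward * math.pow(gamma, t)
        if done:
            break
    return episode_return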