
PyTorch PPO implementation for Cartpole-v0 getting stuck in local optima

I have implemented PPO for the CartPole-v0 environment. However, it does not converge in certain runs of the game, and sometimes it gets stuck in local optima. I have implemented the algorithm using the TD(0) advantage, i.e.

A(S_t) = R_{t+1} + \gamma V(S_{t+1}) - V(S_t)
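
For reference, a minimal standalone sketch of this advantage computation for a single episode (the helper name td0_advantages is only illustrative and separate from the full implementation below), assuming rewards and values are 1-D numpy arrays and the value after the final recorded step is taken to be 0:

import numpy as np

def td0_advantages(rewards, values, gamma=0.99):
    # V(S_{t+1}) for each step, with 0 after the last recorded step
    next_values = np.append(values[1:], 0.0)
    # bootstrapped targets: R_{t+1} + gamma * V(S_{t+1})
    targets = rewards + gamma * next_values
    # TD(0) advantages: target - V(S_t)
    return targets - values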

Here is my code:

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

def running_average(x, n):
    N = n
    kernel = np.ones(N)
    conv_len = x.shape[0]-N
    y = np.zeros(conv_len)
    for i in range(conv_len):
        y[i] = kernel @ x[i:i+N] # "@" is the matrix multiplication operator (np.matmul)
        y[i] /= N
    return y



class ActorNetwork(nn.Module):
    def __init__(self, state_dim, n_actions, learning_rate=0.0003, epsilon_clipping=0.3, update_epochs=10):
        super().__init__()
        self.n_actions = n_actions
        self.model = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, n_actions),
            nn.Softmax(dim=-1)
        ).float()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.epsilon_clipping = epsilon_clipping
        self.update_epochs = update_epochs

    def forward(self, X):
        return self.model(X)

    
    def predict(self, state):
        if state.ndim < 2:
            action_probs = self.model(torch.FloatTensor(state).unsqueeze(0).float())
        else: 
            action_probs = self.model(torch.FloatTensor(state))

        return action_probs.squeeze(0).data.numpy()

   
    def update(self, states, actions, deltas, old_prob):
  
        batch_size = len(states)
        state_batch = torch.Tensor(states)
        action_batch = torch.Tensor(actions)
        delta_batch = torch.Tensor(deltas)
        old_prob_batch = torch.Tensor(old_prob)
        for k in range(self.update_epochs):
            pred_batch = self.model(state_batch)

            # probability of the taken action under the current policy
            prob_batch = pred_batch.gather(dim=1, index=action_batch.long().view(-1, 1)).squeeze()

            # importance-sampling ratio pi_new(a|s) / pi_old(a|s)
            ratio = torch.exp(torch.log(prob_batch) - torch.log(old_prob_batch))

            # PPO clipped surrogate objective
            clipped = torch.clamp(ratio, 1 - self.epsilon_clipping, 1 + self.epsilon_clipping) * delta_batch
            loss_r = -torch.min(ratio*delta_batch, clipped)
            loss = torch.mean(loss_r)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()




class CriticNetwork(nn.Module):
    def __init__(self, state_dim, learning_rate=0.001):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ).float()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)


    def forward(self, X):
        return self.model(X)

    def predict(self, state):
        if state.ndim < 2:
            values = self.model(torch.FloatTensor(state).unsqueeze(0).float())
        else:
            values = self.model(torch.FloatTensor(state))

        return values.data.numpy()

  
    def update(self, states, targets):
        
        state_batch = torch.Tensor(states)
        target_batch = torch.Tensor(targets)
        pred_batch = self.model(state_batch)
        loss = torch.nn.functional.mse_loss(pred_batch, target_batch.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    


def train_ppo_agent(env, episode_length, max_episodes, gamma, visualize_step, learning_rate_actor=0.0003, learning_rate_critic=0.001, epsilon_clipping=0.2, actor_update_epochs=10):

  
    model_actor = ActorNetwork(env.observation_space.shape[0], env.action_space.n, learning_rate=learning_rate_actor,
                               epsilon_clipping=epsilon_clipping, update_epochs=actor_update_epochs)
    model_critic = CriticNetwork(env.observation_space.shape[0], learning_rate=learning_rate_critic)



    EPISODE_LENGTH = episode_length
    MAX_EPISODES = max_episodes
    GAMMA = gamma
    VISUALIZE_STEP = max(1, visualize_step)
    score = []


    for episode in range(MAX_EPISODES):
        curr_state = env.reset()
        done = False
        all_episode_t = []
        score_episode = 0
        for t in range(EPISODE_LENGTH):
            act_prob = model_actor.predict(curr_state)
            action = np.random.choice(np.array(list(range(env.action_space.n))), p=act_prob)
            value = model_critic.predict(curr_state)
            prev_state = curr_state
            curr_state, reward, done, info = env.step(action)
            score_episode += reward
            e_t = {'state': prev_state, 'action':action, 'action_prob':act_prob[action],'reward': reward, 'value': value}
            all_episode_t.append(e_t)
            if done:
                break
        score.append(score_episode)

        # one-step bootstrapped targets: R_{t+1} + gamma * V(S_{t+1}), with V = 0 after the final step
        episode_values = [all_episode_t[t]['value'] for t in range(len(all_episode_t))]
        next_state_estimates = [episode_values[i].item() for i in range(1, len(episode_values))]
        next_state_estimates.append(0)
        bootstrap_estimate = []
        for t in range(len(all_episode_t)):
            G = all_episode_t[t]['reward'] + GAMMA * next_state_estimates[t]
            bootstrap_estimate.append(G)

        episode_target = np.array(bootstrap_estimate)
        episode_values = np.array(episode_values).flatten()  # critic outputs have shape (1, 1); flatten so the subtraction is element-wise
        # compute the advantage for each state in the episode: R_{t+1} + \gamma * V(S_{t+1}) - V(S_t)
        adv_batch = episode_target - episode_values
       
        state_batch = np.array([all_episode_t[t]['state'] for t in range(len(all_episode_t))])
        action_batch = np.array([all_episode_t[t]['action'] for t in range(len(all_episode_t))])
        old_actor_prob = np.array([all_episode_t[t]['action_prob'] for t in range(len(all_episode_t))])
       
        model_actor.update(state_batch, action_batch, adv_batch, old_actor_prob)
       
        model_critic.update(state_batch, episode_target)

        # print the status after every VISUALIZE_STEP episodes
        if episode % VISUALIZE_STEP == 0 and episode > 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(episode, np.mean(score[-VISUALIZE_STEP:-1])))
            # domain knowledge applied to stop training: if the average score across last 100 episodes is greater than 195, game is solved
            if np.mean(score[-100:-1]) > 195:
                break


    # Training plot: Episodic reward over Training Episodes
    score = np.array(score)
    avg_score = running_average(score, visualize_step)
    plt.figure(figsize=(15, 7))
    plt.ylabel("Episodic Reward", fontsize=12)
    plt.xlabel("Training Episodes", fontsize=12)
    plt.plot(score, color='gray', linewidth=1)
    plt.plot(avg_score, color='blue', linewidth=3)
    plt.scatter(np.arange(score.shape[0]), score, color='green', linewidth=0.3)
    plt.savefig("temp/cartpole_ppo_training_plot.pdf")

    # return the trained models
    return model_actor, model_critic

def main():
    env = gym.make('CartPole-v0')
    episode_length = 300
    n_episodes = 5000
    gamma = 0.99
    vis_steps = 100
    learning_rate_actor = 0.0003
    actor_update_epochs = 10
    epsilon_clipping = 0.2
    learning_rate_critic = 0.001
   
    # train the PPO agent
    model_actor, model_critic = train_ppo_agent(env, episode_length, n_episodes, gamma, vis_steps,
                                               learning_rate_actor=learning_rate_actor,
                                               learning_rate_critic=learning_rate_critic,
                                               epsilon_clipping=epsilon_clipping,
                                               actor_update_epochs=actor_update_epochs)


if __name__ == '__main__':
    main()

Am I missing something, or is this kind of behaviour expected when one uses simple TD(0) advantages for PPO, given the nature of the CartPole environment?

If you remove the "-" (the minus sign) in the line:

loss_r = -torch.min(ratio*delta_batch, clipped)

The score will then start to steadily increase over time. Before this fix you had a negative loss that grew over time, which is not how a loss should work for neural networks, since gradient descent works by minimizing the loss. So you want a positive loss that the optimizer can minimize.
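
For reference, a small self-contained sketch of what that sign change looks like on the surrogate term, using made-up dummy tensors (the values below are only illustrative; epsilon_clipping is 0.2 as in the code above):

import torch

# dummy per-timestep quantities standing in for one small batch
prob_batch = torch.tensor([0.6, 0.5, 0.7])
old_prob_batch = torch.tensor([0.5, 0.5, 0.8])
delta_batch = torch.tensor([1.0, -0.5, 2.0])
epsilon_clipping = 0.2

ratio = torch.exp(torch.log(prob_batch) - torch.log(old_prob_batch))
clipped = torch.clamp(ratio, 1 - epsilon_clipping, 1 + epsilon_clipping) * delta_batch
loss_r = torch.min(ratio * delta_batch, clipped)  # no leading "-", as suggested above
loss = torch.mean(loss_r)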

Hope my answer is somewhat clear, and sorry I cannot go into deeper detail.

My run can be seen in the attached image.
