
Problem getting DQN to learn CartPole-v1 (PyTorch)

So my DQN was training fine and solved the environment after ~65,000 iterations. However, I started working on something else, and now it's completely broken and never gets anywhere near the same level again.

Based on what worked before, I've adjusted the hyperparameters, but I'm still not seeing the same results.

import gym
import numpy as np

import torch
from torch import nn
from torch.nn import functional as F
from torch import optim

from models import DQN
from memory import Memory
from utils import wrap_input, epsilon_greedy

def main() -> int:
    env = gym.make("CartPole-v1")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Online and offline model for learning
    model = DQN(env.observation_space, env.action_space, 24).to(device)
    target = DQN(env.observation_space, env.action_space, 24).to(device)
    target.eval()

    # Optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=.001)
    loss_fn = F.smooth_l1_loss


    memory = Memory(10_000)
    obs, info = env.reset()

    for it in range(65_000):
        # Do this for the batch norm
        model.eval()

        # Maybe explore
        if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
            state = wrap_input(obs, device).unsqueeze(0)
            action  = model(state).argmax().item()
        else:
            action = env.action_space.sample()

        # Act in environment and store the memory
        next_state, reward, done, truncated, info = env.step(action)
        if truncated or done:
            next_state = np.zeros(env.observation_space.shape)
        memory.store([obs, action, reward, int(done), next_state])
        done = done or truncated

        if done:
            obs, info = env.reset()

        # Train
        if len(memory) > 32:
            model.train()
            states, actions, rewards, dones, next_states = memory.sample(32)

            # Wrap and move all values to the cpu
            states = wrap_input(states, device)
            actions = wrap_input(actions, device, torch.int64, reshape=True)
            next_states = wrap_input(next_states, device)
            rewards = wrap_input(rewards, device, reshape=True)
            dones = wrap_input(dones, device, reshape=True)

            # Get current q-values
            qs = model(states)
            qs = torch.gather(qs, dim=1, index=actions)

            # Compute target q-values
            with torch.no_grad():
                next_qs, _ = target(next_states).max(dim=1)
                next_qs = next_qs.reshape(-1, 1)

            target_qs = rewards + .9 * (1 - dones) * next_qs.reshape(-1, 1)

            # Compute loss
            loss = loss_fn(qs, target_qs)
            optimizer.zero_grad()
            loss.backward()
            
            # Clip gradients
            nn.utils.clip_grad_norm_(model.parameters(), 1)

            # Backprop
            optimizer.step()

            # Soft (Polyak) update of the target network: target <- tau * online + (1 - tau) * target, with tau = 1e-2
            with torch.no_grad():
                for target_param, local_param in zip(target.parameters(), model.parameters()):
                    target_param.data.copy_(1e-2 * local_param.data + (1 - 1e-2) * target_param.data)


        if it % 200 == 0:
            target.load_state_dict(model.state_dict())

# models.py
class FlatExtractor(nn.Module):
    '''Does nothing but pass the input on'''
    def __init__(self, obs_space):
        super(FlatExtractor, self).__init__()

        self.n_flatten = obs_space.shape[0]

    def forward(self, obs):
        return obs


class DQN(nn.Module):
    def __init__(self, obs_space, act_space, layer_size):
        super(DQN, self).__init__()

        # Feature extractor
        if len(obs_space.shape) == 1:
            self.feature_extractor = FlatExtractor(obs_space)
        elif len(obs_space.shape) == 3:
            self.feature_extractor = NatureCnn(obs_space)
        else:
            raise NotImplementedError("This type of environment is not supported")

        # Neural network
        self.net = nn.Sequential(
            nn.Linear(self.feature_extractor.n_flatten, layer_size),
            nn.BatchNorm1d(layer_size),
            nn.ReLU(),
            nn.Linear(layer_size, layer_size),
            nn.BatchNorm1d(layer_size),
            nn.ReLU(),
            nn.Linear(layer_size, act_space.n),
        )

    def forward(self, obs):
        return self.net(self.feature_extractor(obs))

# memory.py
import random
from collections import deque

class Memory(object):
    def __init__(self, maxlen):
        self.memory = deque(maxlen=maxlen)

    def store(self, experience):
        self.memory.append(experience)

    def sample(self, n_samples):
        return zip(*random.sample(self.memory, n_samples))

    def __len__(self):
        return len(self.memory)

# utils.py
def wrap_input(arr, device, dtype=torch.float, reshape=False):
    output = torch.from_numpy(np.array(arr)).type(dtype).to(device)
    if reshape:
        output = output.reshape(-1, 1)

    return output

def epsilon_greedy(start, end, n_steps, it):
    return max(start - (start - end) * (it / n_steps), end)

Is there something obvious I'm missing? I've tried training for longer and it doesn't change anything. The biggest problem seems to be the loss exploding, and even changing the tau for the hard update doesn't seem to fix it.

I had a hard time getting your code to run, so I had to comment out a few things. I also commented out things that add unnecessary complexity while debugging; for example, a simple environment like CartPole doesn't need a target network. Also, focus more on the total reward obtained rather than the loss.

Some of the main changes I made are -

  1. I swapped your exploration and exploitation code
        if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
            state = wrap_input(obs, device).unsqueeze(0)
            action  = model(state).argmax().item()
        else:
            action = env.action_space.sample()

Your code essentially starts out exploiting by taking the argmax, and only once the epsilon value gets low enough does it begin sampling randomly. It needs to be the other way around (a quick check of the epsilon schedule after this list shows why).

I replaced it with -

        if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
            action = env.action_space.sample()
        else:
            state = wrap_input(obs, device).unsqueeze(0)
            action = model(state).argmax().item()
  2. I increased your batch size. A larger batch size in CartPole speeds up training considerably -
states, actions, rewards, dones, next_states = memory.sample(128)
  3. Also, it is better to wait until your model has gathered enough experience before it starts training -
        if len(memory) > 500:
            model.train()
            states, actions, rewards, dones, next_states = memory.sample(128)
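
For reference, here is a quick check of the epsilon schedule mentioned in point 1 (the function is copied from utils.py above). Epsilon starts at 1.0 and decays linearly to 0.01 over 15,000 steps, so np.random.random() <= epsilon is almost always true early on, which is why the original ordering exploited with argmax first and only explored later:

    def epsilon_greedy(start, end, n_steps, it):
        return max(start - (start - end) * (it / n_steps), end)

    for it in (0, 5_000, 10_000, 15_000, 30_000):
        print(it, round(epsilon_greedy(1.0, .01, 15_000, it), 3))
    # 0 1.0
    # 5000 0.67
    # 10000 0.34
    # 15000 0.01
    # 30000 0.01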

The other changes I made were to simplify debugging.

  1. I didn't see any use for class FlatExtractor(nn.Module), so I removed it and made the following change -
        if len(obs_space.shape) == 1:
            self.feature_extractor = env.observation_space.shape[0]

    def forward(self, obs):
        return self.net(obs)
  2. I removed all instances of BatchNorm

  3. Replaced the loss with MSELoss and removed the gradient clipping

loss_fn = nn.MSELoss()
  4. Changed the learning rate to lr=.0001

  5. Increased the width of your neural network -

model = DQN(env.observation_space, env.action_space, 128).to(device)
  6. Removed the target network and its corresponding soft update.

  7. Added the total reward to check whether the algorithm is learning (an optional running-average variant is sketched after this list) -

    tot_rew = 0
    for it in range(65_000):
        next_state, reward, done, info = env.step(action)
        tot_rew += reward
        if done:
            print("tot_rew = ", tot_rew)
            obs= env.reset()
            tot_rew = 0
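
As a small optional extension (not part of the original answer, just a sketch): a running average of recent episode returns is often easier to read than the raw per-episode prints when judging whether the algorithm is learning.

    from collections import deque

    # Hypothetical helper, not in the answer's code: keeps the last 20
    # episode returns and prints their mean each time an episode ends.
    recent_returns = deque(maxlen=20)

    def log_episode(tot_rew):
        recent_returns.append(tot_rew)
        avg = sum(recent_returns) / len(recent_returns)
        print(f"episode return = {tot_rew:.1f}, mean of last {len(recent_returns)} = {avg:.1f}")

    # Inside the training loop, this would replace the bare print:
    #     if done:
    #         log_episode(tot_rew)
    #         obs = env.reset()
    #         tot_rew = 0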

This is the total reward I was getting towards the end -

tot_rew =  228.0
tot_rew =  472.0
tot_rew =  243.0
tot_rew =  300.0

Here is the entire fixed code -

import gym
import numpy as np

import torch
from torch import nn
from torch.nn import functional as F
from torch import optim

env = gym.make("CartPole-v1")
def main() -> int:
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Online and offline model for learning

    model = DQN(env.observation_space, env.action_space, 128).to(device)

    target = DQN(env.observation_space, env.action_space, 24).to(device)

    # target.eval()

    # Optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=.0001)
    loss_fn = nn.MSELoss()


    memory = Memory(10_000)

    obs = env.reset()
    tot_rew = 0
    for it in range(65_000):
        # print("it = ", it)
        # Do this for the batch norm
        # model.eval()

        # Maybe explore
        if np.random.random() <= epsilon_greedy(1.0, .01, 15_000, it):
            action = env.action_space.sample()
        else:
            state = wrap_input(obs, device).unsqueeze(0)
            action = model(state).argmax().item()

            # print("epsilon_greedy(1.0, .01, 15_000, it) = ", epsilon_greedy(1.0, .01, 15_000, it))
            # print("check = ", model(state).detach().numpy())
            # print("action = ", action)

        # Act in environment and store the memory

        next_state, reward, done, info = env.step(action)
        tot_rew += reward
        if done:
            next_state = np.zeros(env.observation_space.shape)
        memory.store([obs, action, reward, int(done), next_state])
        done = done

        obs = next_state

        if done:
            print("tot_rew = ", tot_rew)
            obs= env.reset()
            tot_rew = 0

        # Train
        if len(memory) > 500:
            model.train()
            states, actions, rewards, dones, next_states = memory.sample(128)

            # Wrap and move all values to the cpu

            states = wrap_input(states, device)
            # print("states.shape = ",states.shape)
            actions = wrap_input(actions, device, torch.int64, reshape=True)
            next_states = wrap_input(next_states, device)
            rewards = wrap_input(rewards, device, reshape=True)
            dones = wrap_input(dones, device, reshape=True)

            # Get current q-values
            qs = model(states)
            # print("qs.shape = ", qs.shape)
            qs = torch.gather(qs, dim=1, index=actions)

            # Compute target q-values
            with torch.no_grad():
                next_qs, _ = model(next_states).max(dim=1)
                next_qs = next_qs.reshape(-1, 1)

            target_qs = rewards + .9 * (1 - dones) * next_qs.reshape(-1, 1)

            # Compute loss
            loss = loss_fn(qs, target_qs)
            # print("loss.shape = ", loss)
            optimizer.zero_grad()
            loss.backward()
            
            # Clip gradients
            # nn.utils.clip_grad_norm_(model.parameters(), 1)

            # Backprop
            optimizer.step()

            # soft update
        #     with torch.no_grad():
        #         for target_param, local_param in zip(target.parameters(), model.parameters()):
        #             target_param.data.copy_(1e-2 * local_param.data + (1 - 1e-2) * target_param.data)


        # if it % 200 == 0:
        #     target.load_state_dict(model.state_dict())

# models.py
class FlatExtractor(nn.Module):
    '''Does nothing but pass the input on'''
    def __init__(self, obs_space):
        super(FlatExtractor, self).__init__()

        self.n_flatten = 1

    def forward(self, obs):
        return obs


class DQN(nn.Module):
    def __init__(self, obs_space, act_space, layer_size):
        super(DQN, self).__init__()

        # Feature extractor
        if len(obs_space.shape) == 1:
            self.feature_extractor = env.observation_space.shape[0]

        elif len(obs_space.shape) == 3:
            self.feature_extractor = NatureCnn(obs_space)
        else:
            raise NotImplementedError("This type of environment is not supported")
        

        # Neural network
        self.net = nn.Sequential(
            nn.Linear(self.feature_extractor, layer_size),
            nn.ReLU(),
            nn.Linear(layer_size, layer_size),
            nn.ReLU(),
            nn.Linear(layer_size, act_space.n),
        )

    def forward(self, obs):

        return self.net(obs)

# memory.py
import random
from collections import deque

class Memory(object):
    def __init__(self, maxlen):
        self.memory = deque(maxlen=maxlen)

    def store(self, experience):
        self.memory.append(experience)

    def sample(self, n_samples):
        return zip(*random.sample(self.memory, n_samples))

    def __len__(self):
        return len(self.memory)

# utils.py
def wrap_input(arr, device, dtype=torch.float, reshape=False):
    output = torch.from_numpy(np.array(arr)).type(dtype).to(device)
    if reshape:
        output = output.reshape(-1, 1)

    return output

def epsilon_greedy(start, end, n_steps, it):
    return max(start - (start - end) * (it / n_steps), end)

main()
