
How to make softmax work with policy gradient?

I am trying to change Karpathy's code so that it works with a softmax function, so that I can use it for games with more than 2 actions. However, I cannot get it to work. Can someone point me in the right direction, please? Thanks. Below is my attempt.

""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
import numpy as np
import cPickle as pickle
import gym

# hyperparameters
H = 100 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.9 # discount factor for reward
decay_rate = 0.9 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = False
num_action = 2

# model initialization
D = 6 # input dimensionality: size of the Acrobot-v1 observation vector
if resume:
  model = pickle.load(open('save.p', 'rb'))
else:
  model = {}
  model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
  model['W2'] = np.random.randn(num_action, H) / np.sqrt(H)

grad_buffer = { k : np.zeros_like(v) for k,v in model.iteritems() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.iteritems() } # rmsprop memory

def sigmoid(x): 
  return 1.0 / (1.0 + np.exp(-x)) # sigmoid "squashing" function to interval [0,1]

def softmax(w, t = 1.0):
    e = np.exp(np.array(w) / t)
    dist = e / np.sum(e)
    return dist

def prepro(I):
  """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
  I = I[35:195] # crop
  I = I[::2,::2,0] # downsample by factor of 2
  I[I == 144] = 0 # erase background (background type 1)
  I[I == 109] = 0 # erase background (background type 2)
  I[I != 0] = 1 # everything else (paddles, ball) just set to 1
  return I.astype(np.float).ravel()

def discount_rewards(r):
  """ take 1D float array of rewards and compute discounted reward """
  discounted_r = np.zeros_like(r)
  running_add = 0
  for t in reversed(xrange(0, r.size)):
    if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * gamma + r[t]
    discounted_r[t] = running_add
  return discounted_r

def policy_forward(x):
  h = np.dot(model['W1'], x)
  h[h<0] = 0 # ReLU nonlinearity
  logp = np.dot(model['W2'], h)
  p = softmax(logp)
  return p, h # return probability distribution over actions, and hidden state

def policy_backward(eph, epdlogp):
  """ backward pass. (eph is array of intermediate hidden states) """
  # print eph.shape
  # print epdlogp.shape
  # print model['W2'].shape
  # dW2 = np.dot(eph.T, epdlogp).ravel()
  # dh = np.outer(epdlogp, model['W2'])
  # dh[eph <= 0] = 0 # backpro prelu
  # dW1 = np.dot(dh.T, epx)
  # return {'W1':dW1, 'W2':dW2}
  dW2 = np.dot(eph.T, epdlogp).T
  # print dW2.shape
  dh = np.dot(epdlogp, model['W2'])
  # print dh.shape
  dh[eph <= 0] = 0 # backpro prelu
  dW1 = np.dot(dh.T, epx)
  return {'W1':dW1, 'W2':dW2}

env = gym.make("Acrobot-v1")
observation = env.reset()
prev_x = None # used in computing the difference frame
xs,hs,dlogps,drs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0
while True:
  if render: env.render()

  # preprocess the observation, set input to network to be difference image
  cur_x = observation
  x = cur_x - prev_x if prev_x is not None else np.zeros(D)
  prev_x = cur_x

  # forward the policy network and choose an action from the returned probabilities
  aprob, h = policy_forward(x)
  action = np.argmax(aprob)
  if action == 1:
    action = 2
  # action = 2 if np.random.uniform() > aprob[1] else 0
  # print aprob

  # action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

  # record various intermediates (needed later for backprop)
  xs.append(x) # observation
  hs.append(h) # hidden state

  # if action == 0:
  #   y = [1,0,0]
  # elif action == 1:
  #   y = [0,1,0]
  # else:
  #   y = [0,0,1]


  y = [1,0] if action == 0 else [0,1] # a "fake label"

  dlogps.append(aprob-y) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

  # step the environment and get new measurements
  observation, reward, done, info = env.step(action)
  reward_sum += reward

  drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

  if done: # an episode finished
    episode_number += 1

    # stack together all inputs, hidden states, action gradients, and rewards for this episode
    epx = np.vstack(xs)
    eph = np.vstack(hs)
    epdlogp = np.vstack(dlogps)
    epr = np.vstack(drs)
    xs,hs,dlogps,drs = [],[],[],[] # reset array memory

    # compute the discounted reward backwards through time
    discounted_epr = discount_rewards(epr)
    # standardize the rewards to be unit normal (helps control the gradient estimator variance)
    discounted_epr -= np.mean(discounted_epr)
    discounted_epr /= np.std(discounted_epr)

    epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
    grad = policy_backward(eph, epdlogp)
    for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

    # perform rmsprop parameter update every batch_size episodes
    if episode_number % batch_size == 0:
      for k,v in model.iteritems():
        g = grad_buffer[k] # gradient
        rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
        model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
        grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

    # boring book-keeping
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    print 'resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward)
    if episode_number % 100 == 0: pickle.dump(model, open('save.p', 'wb'))
    reward_sum = 0
    observation = env.reset() # reset env
    prev_x = None

When debugging, this code runs into a NaN issue which I can't figure out how to fix.

I think the NaN problem that you mention is due to your softmax function.

Softmax computes the exponential function, exp(x), which can easily exceed the range of single- or double-precision floats for moderately large values of x. The overflowing exp calls return inf, and the inf / inf division in the normalization step then produces NaN.
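For illustration, here is a minimal snippet (assuming NumPy's default float64 dtype) that reproduces the failure with the naive softmax:

import numpy as np

logits = np.array([10.0, 1000.0])  # 1000.0 is well past the float64 exp limit (~709)
e = np.exp(logits)                 # -> [2.2e+04, inf], with an overflow RuntimeWarning
print(e / np.sum(e))               # -> [ 0. nan], because inf / inf is nan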

Solution

The mathematical form of Softmax is:

s[i] = exp(x[i]) / (exp(x[0]) + exp(x[1]) + .. + exp(x[n-1]))

We can divide the numerator and denominator of this expression by an arbitrary value, say exp(a) without affecting the result.

s[i] = (exp(x[i]) / exp(a)) / ((exp(x[0]) + exp(x[1]) + .. + exp(x[n-1])) / exp(a))

s[i] = exp(x[i]-a) / (exp(x[0]-a) + exp(x[1]-a) + .. + exp(x[n-1]-a))

If we let a = max(x), then all exponents are zero or negative, so no call to exp can overflow; the term for the maximum element is exp(0) = 1, so the denominator is at least 1 and the division cannot produce NaN.

I don't use Python or numpy, but I imagine you could define softmax something like:

def softmax(w):
    a = np.max(w)
    e = np.exp(np.array(w) - a)
    dist = e / np.sum(e)
    return dist
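As a quick sanity check (hypothetical values, assuming the same NumPy setup as in the question), the shifted version stays finite even for logits that would overflow the naive implementation:

logits = np.array([1000.0, 1001.0, 999.0])
print(softmax(logits))  # -> roughly [0.245 0.665 0.090]; sums to 1, no inf or nan

If you want to keep the temperature parameter t from your original definition, divide w by t first and then subtract the max of the scaled values; the same reasoning applies.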
