I am a beginner in programming with TensorFlow and reinforcement learning. I have written a simple program with a reinforcement learning algorithm. To be more exact, I've adapted an example program from the book “Hands-On Reinforcement Learning with Python” by Sudharsan Ravichandiran.
I've taken the following example: https://github.com/sudharsan13296/Hands-On-Reinforcement-Learning-With-Python/blob/master/08.%20Atari%20Games%20with%20DQN/8.8%20Building%20an%20Agent%20to%20Play%20Atari%20Games.ipynb
That example builds an agent that plays the Atari game Pac-Man using a Deep Q Network. In my program I've kept the DQN algorithm and changed the model and the state vector. The agent is now a car on a 2-D plane. It starts at position x=1000, y=1000, and my intention is that the car should travel to the position x=0, y=0. I set the reward to f = 1/sqrt(x1^2+y1^2) - 1/sqrt(x0^2+y0^2), where (x0, y0) is the previous position of the car and (x1, y1) is the next position, i.e. the change in inverse distance to the origin, exactly as in the code below. So as the car travels toward position (0, 0), the rewards increase.
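To give a sense of the reward scale, here is a small, hypothetical helper that just re-implements the same distance-based reward used in the code below; moving from (1000, 1000) to (999, 999) yields a per-step reward of only about 7e-7:

import math

# Hypothetical helper: the same per-step reward as in ModelStep below
def step_reward(x0, y0, x1, y1):
    # 1/next_distance - 1/previous_distance
    return 1 / math.hypot(x1, y1) - 1 / math.hypot(x0, y0)

print(step_reward(1000, 1000, 999, 999))  # ~7.1e-07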
The state vector has only 4 dimensions: [x, y, cos(fi), sin(fi)], where x and y are the two coordinates of the car and fi is its heading angle in the 2-D plane. The car has three actions: turn right, turn left, or keep moving in the same direction.
As you can see, this is a very simple model, but the DQN algorithm doesn't learn. It cannot find a good policy, and the car doesn't travel to position (0, 0).
I'd like to ask a reinforcement learning specialist to run my simple Python program and find out what the problem is.
import math
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf2
import tensorflow.compat.v1 as tf
from tensorflow.compat.v1.layers import dense
from collections import deque, Counter
# Global environment state: car position (xx, yy), heading angle fi, constant speed V
fi = 0.0
xx = 0.0
yy = 0.0
V = 1.0
MaxAbsAction = 0.1   # maximum change of the heading angle per step (radians)
N = 1000             # maximum number of steps per episode
x_Data = np.zeros(N, dtype=float)   # trajectory history for plotting
y_Data = np.zeros(N, dtype=float)
i_D = 0
def ModelReset():
    """Put the car back at the start position (1000, 1000) and return the initial observation."""
    global fi, xx, yy, x_Data, y_Data, i_D
    fi = 0.0
    xx = 1000.0
    yy = 1000.0
    i_D = 0
    x_Data[i_D] = xx
    y_Data[i_D] = yy
    obs = np.array([xx, yy, math.cos(fi), math.sin(fi)], dtype=float)
    return obs
def ModelStep(action):
    """Apply a steering action, move the car one step, and return (next_obs, reward, done)."""
    global fi, xx, yy, x_Data, y_Data
    global MaxAbsAction, i_D, V, N
    PreviousDistance = math.sqrt(xx * xx + yy * yy)
    # Clip the steering action to [-MaxAbsAction, MaxAbsAction]
    if action > MaxAbsAction:
        action = MaxAbsAction
    if action < -MaxAbsAction:
        action = -MaxAbsAction
    fi += action
    xx += V * math.cos(fi)
    yy += V * math.sin(fi)
    i_D += 1
    x_Data[i_D] = xx
    y_Data[i_D] = yy
    NextDistance = math.sqrt(xx * xx + yy * yy)
    # Reward is the change in inverse distance to the origin
    reward = 1 / NextDistance - 1 / PreviousDistance
    next_obs = np.array([xx, yy, math.cos(fi), math.sin(fi)], dtype=float)
    done = i_D >= N - 1 or NextDistance < 20
    return next_obs, reward, done
n_outputs = 3   # three discrete actions: turn left, keep straight, turn right

def n_to_float_action(n):
    """Map a discrete action index 0..n_outputs-1 to a steering value in [-MaxAbsAction, MaxAbsAction]."""
    global MaxAbsAction
    return -MaxAbsAction + 2 * MaxAbsAction / (n_outputs - 1) * n
def Q_Network(X, name_scope):
    """Build a small fully connected Q-network; return its trainable variables and outputs."""
    initializer = tf.keras.initializers.VarianceScaling()
    with tf.variable_scope(name_scope) as scope:
        fc1 = dense(X, 100, kernel_initializer=initializer,
                    activation=tf.keras.activations.sigmoid)
        fc2 = dense(fc1, 100, kernel_initializer=initializer,
                    activation=tf.keras.activations.relu)
        output = dense(fc2, n_outputs, kernel_initializer=initializer)
        # Map variable names (with the scope prefix stripped) to the variables themselves
        Q_vars = {v.name[len(scope.name):]: v for v in
                  tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope=scope.name)}
        return Q_vars, output
# Epsilon-greedy exploration schedule
epsilon = 0.1
eps_min = 0.01
eps_max = 1
eps_decay_steps = 5000000

def epsilon_greedy(action, step):
    """With probability epsilon take a random action, otherwise keep the greedy one."""
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    if np.random.rand(1) < epsilon:
        return np.random.randint(n_outputs), epsilon
    else:
        return action, epsilon
def sample_memories(batch_size):
    """Sample a random minibatch of transitions from the replay buffer."""
    if exp_buffer_full:
        size_buff = exp_buffer_length
    else:
        size_buff = exp_buffer_pos
    perm_batch = np.random.permutation(size_buff)[:batch_size]
    mem = exp_buffer[perm_batch]
    # Columns: obs, action, next_obs, reward, done
    return mem[:, 0], mem[:, 1], mem[:, 2], mem[:, 3], mem[:, 4]
# Training hyperparameters
num_episodes = 100000
batch_size = 200
learning_rate = 0.001
X_shape = (None, 4)
discount_factor = 0.97
global_step = 0
copy_steps = 10000    # how often to run the weight-copy op between the two networks
steps_train = 40      # train on a minibatch every steps_train environment steps
start_steps = 2000    # steps to collect before training starts
logdir = 'logs'

# Experience replay buffer; each row is [obs, action, next_obs, reward, done]
exp_buffer_length = 1000000
exp_buffer_pos = 0
exp_buffer_full = False
exp_buffer = np.zeros(shape=(exp_buffer_length, 5), dtype=object)
tf.disable_eager_execution()

# Placeholders for the state and a training-mode flag
X = tf.placeholder(tf.float32, shape=X_shape, name='X')
in_training_mode = tf.placeholder(tf.bool, name='in_training_mode')

# Main and target Q-networks
mainQ, mainQ_outputs = Q_Network(X, 'mainQ')
targetQ, targetQ_outputs = Q_Network(X, 'targetQ')

# Q-value of the action that was actually taken
X_action = tf.placeholder(tf.int32, shape=(None,), name='X_action')
Q_action = tf.reduce_sum(
    targetQ_outputs * tf.one_hot(X_action, n_outputs),
    axis=-1, keepdims=True)

# Op that copies each target-network variable into the corresponding main-network variable
copy_op = [tf.assign(main_name, targetQ[var_name])
           for var_name, main_name in mainQ.items()]
copy_target_to_main = tf.group(*copy_op)

# Loss and optimizer
y = tf.placeholder(tf.float32, shape=(None, 1), name='y')
loss = tf.reduce_mean(tf.square(y - Q_action))
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

loss_summary = tf.summary.scalar('LOSS', loss)
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

train_loss = None
init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()
    for i in range(num_episodes):
        done = False
        obs = ModelReset()
        epoch = 0
        episodic_reward = 0
        action_counter = Counter()
        episodic_loss = []

        while not done:
            # Greedy action from the main network, then epsilon-greedy exploration
            actions = mainQ_outputs.eval(
                feed_dict={X: [obs], in_training_mode: False})
            action = np.argmax(actions, axis=-1)[0]
            action_counter[str(action)] += 1
            action, epsilonn = epsilon_greedy(action, global_step)

            # Step the environment and store the transition in the replay buffer
            next_obs, reward, done = ModelStep(n_to_float_action(action))
            exp_buffer[exp_buffer_pos, :] = np.array(
                [obs, action, next_obs, reward, done], dtype=object)
            exp_buffer_pos += 1
            if exp_buffer_pos >= exp_buffer_length:
                exp_buffer_pos = 0
                exp_buffer_full = True

            # Train on a sampled minibatch every steps_train environment steps
            if global_step % steps_train == 0 and global_step > start_steps:
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)
                o_obs = [x for x in o_obs]
                o_next_obs = [x for x in o_next_obs]
                next_act = mainQ_outputs.eval(
                    feed_dict={X: o_next_obs, in_training_mode: False})
                # One-step TD target: r + gamma * max_a' Q(s', a')
                y_batch = o_rew + discount_factor * np.max(next_act, axis=-1)
                train_loss, _ = sess.run(
                    [loss, training_op],
                    feed_dict={X: np.array(o_obs, dtype=float),
                               y: np.expand_dims(
                                   np.array(y_batch, dtype=float), axis=-1),
                               X_action: np.array(o_act, dtype=np.int32),
                               in_training_mode: True})

            # Periodically synchronise the two networks
            if (global_step + 1) % copy_steps == 0 and global_step > start_steps:
                copy_target_to_main.run()
                print('copy_target_to_main.run()')

            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward

        print('Episode', i, 'Reward', episodic_reward, 'epsilon', epsilonn,
              'loss', train_loss)
        if (i + 1) % 100 == 0:
            plt.plot(x_Data, y_Data)
            plt.show()
I've found the problem in my simple program: I must normalize the state vector and the rewards so that the values lie in the interval [-1, 1]. I hadn't done this. Once I do, the program starts working very well.
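For anyone hitting the same issue, here is a minimal sketch of what I mean by normalization. The constants POS_SCALE and REWARD_SCALE are assumptions chosen for this particular task, not part of the original code; the idea is simply to rescale the observation and the reward before they go into the network and the replay buffer:

POS_SCALE = 1000.0      # assumed maximum |x|, |y| for this task
REWARD_SCALE = 1000.0   # assumed factor chosen so per-step rewards are roughly in [-1, 1]

def normalize_obs(obs):
    # obs = [x, y, cos(fi), sin(fi)]; the angle features are already in [-1, 1]
    x, y, c, s = obs
    return np.array([x / POS_SCALE, y / POS_SCALE, c, s], dtype=float)

def normalize_reward(reward):
    return reward * REWARD_SCALE

Applied to the outputs of ModelReset and ModelStep, this keeps every element of the state vector, and the reward, in a range the network can handle.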