TensorFlow reinforcement learning model will barely ever make a decision on its own and will not learn.
My simple reinforcement learning model doesn't learn, and I don't know why.
I am a beginner in TensorFlow and reinforcement learning programming. I wrote a simple program using a reinforcement learning algorithm. More precisely, I reworked an example program from the book "Hands-On Reinforcement Learning with Python" by Sudharsan Ravichandiran.
That example builds an agent with a Deep Q Network to play the Atari game Pac-Man. In my program I kept the DQN algorithm but changed the model and the state vector. The agent is now a car on a two-dimensional plane. It starts moving at position x=1000, y=1000. The idea is that the car has to drive to the position with coordinates x=0, y=0. I set the reward to the function f = 1/(x1^2+y1^2) - 1/(x0^2+y0^2), where (x0, y0) is the car's previous position and (x1, y1) is its next position. So the reward increases as the car drives towards position (0, 0).
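To give a sense of scale, one step towards the origin from the start position gives only a tiny reward (here using the distance-based form that ModelStep below actually implements):

import math

# One step of length ~1 straight towards the origin from (1000, 1000):
d0 = math.sqrt(1000**2 + 1000**2)   # previous distance from the origin, ~1414.2
d1 = d0 - 1.0                        # next distance after one step
print(1/d1 - 1/d0)                   # ~5e-7, an extremely small per-step reward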
The state vector has only 4 dimensions: [x, y, sin(fi), cos(fi)], where x, y are the car's coordinates and fi is the car's heading angle in the 2D plane. The car has one action: it can turn right, turn left, or keep going in the same direction.
As you can see, this is a very simple model, but the DQN algorithm does not learn. It cannot find a good policy and the car never drives to position (0, 0).
I would like to ask reinforcement learning experts to run my simple program in Python and find the problem in this very simple program.
import math
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf2
import tensorflow.compat.v1 as tf
from tensorflow.compat.v1.layers import dense
from collections import deque, Counter

# Global state of the simulated car: position (xx, yy), heading angle fi, speed V
fi = np.float64(0)
xx = np.float64(0)
yy = np.float64(0)
V = np.float64(1)
MaxAbsAction = 0.1               # maximum change of the heading angle per step
N = 1000                         # maximum number of steps per episode
x_Data = np.zeros(N, dtype=np.float64)   # trajectory of the current episode
y_Data = np.zeros(N, dtype=np.float64)
i_D = int(0)
def ModelReset():
    # Put the car back at the start position (1000, 1000) with heading 0
    global fi, xx, yy, x_Data, y_Data, i_D
    fi = np.float64(0)
    xx = np.float64(1000)
    yy = np.float64(1000)
    i_D = int(0)
    x_Data[i_D] = xx
    y_Data[i_D] = yy
    obs = np.array([xx, yy, math.cos(fi), math.sin(fi)], dtype=np.float64)
    return obs
def ModelStep(action):
    global fi, xx, yy, x_Data, y_Data
    global MaxAbsAction, i_D, V, N
    PreviousDistance = math.sqrt(xx*xx + yy*yy)
    # Clip the steering action to [-MaxAbsAction, MaxAbsAction]
    if action > MaxAbsAction:
        action = MaxAbsAction
    if action < -MaxAbsAction:
        action = -MaxAbsAction
    # Turn by 'action' and move one step of length V in the new direction
    fi += action
    xx += V * math.cos(fi)
    yy += V * math.sin(fi)
    i_D += 1
    x_Data[i_D] = xx
    y_Data[i_D] = yy
    NextDistance = math.sqrt(xx*xx + yy*yy)
    # The reward is positive when the car gets closer to the origin
    reward = (1/NextDistance - 1/PreviousDistance)
    next_obs = np.array([xx, yy, math.cos(fi), math.sin(fi)], dtype=np.float64)
    done = i_D >= N-1 or NextDistance < 20
    return next_obs, reward, done
n_outputs = 3   # three discrete actions: turn left, go straight, turn right

def n_to_float_action(n):
    # Map the discrete action index {0, 1, 2} to a steering value
    # {-MaxAbsAction, 0, +MaxAbsAction}
    global MaxAbsAction
    return -MaxAbsAction + 2*MaxAbsAction/(n_outputs-1)*n
def Q_Network(X, name_scope):
    initializer = tf.keras.initializers.VarianceScaling()
    with tf.variable_scope(name_scope) as scope:
        fc1 = dense(X, 100, kernel_initializer=initializer,
                    activation=tf.keras.activations.sigmoid)
        fc2 = dense(fc1, 100, kernel_initializer=initializer,
                    activation=tf.keras.activations.relu)
        output = dense(fc2, n_outputs,
                       kernel_initializer=initializer)
        # Trainable variables of this network, keyed by their name relative to the scope
        Q_vars = {v.name[len(scope.name):]: v for v in
                  tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope=scope.name)}
        return Q_vars, output
epsilon = 0.1
eps_min = 0.01
eps_max = 1
eps_decay_steps = 5000000

def epsilon_greedy(action, step):
    # Linearly decay epsilon from eps_max to eps_min over eps_decay_steps steps
    epsilon = max(eps_min, eps_max - (eps_max-eps_min)*step/eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs), epsilon
    else:
        return action, epsilon
def sample_memories(batch_size):
    if exp_buffer_full:
        size_buff = exp_buffer_length
    else:
        size_buff = exp_buffer_pos
    perm_batch = np.random.permutation(size_buff)[:batch_size]
    mem = exp_buffer[perm_batch]
    return mem[:, 0], mem[:, 1], mem[:, 2], mem[:, 3], mem[:, 4]
num_episodes = 100000
batch_size = 200
learning_rate = 0.001
X_shape = (None, 4)
discount_factor = 0.97
global_step = 0
copy_steps = 10000       # how often the trained weights are copied to the other network
steps_train = 40         # how often a training step is performed
start_steps = 2000       # number of steps collected before training starts
logdir = 'logs'

# Replay buffer: each row stores [obs, action, next_obs, reward, done]
exp_buffer_length = 1000000
exp_buffer_pos = 0
exp_buffer_full = False
exp_buffer = np.zeros(shape=(exp_buffer_length, 5), dtype=object)
tf.compat.v1.disable_eager_execution()

X = tf.placeholder(tf.float32, shape=X_shape, name='X')
in_training_mode = tf.placeholder(tf.bool, name='in_training_mode')

# Two Q networks: 'targetQ' is the one trained below, and its weights are
# periodically copied into 'mainQ', which is used to compute the bootstrap targets.
mainQ, mainQ_outputs = Q_Network(X, 'mainQ')
targetQ, targetQ_outputs = Q_Network(X, 'targetQ')

X_action = tf.placeholder(tf.int32, shape=(None,), name='X_action')
# Q value of the action that was actually taken
Q_action = tf.reduce_sum(
    targetQ_outputs * tf.one_hot(X_action, n_outputs),
    axis=-1, keepdims=True)

copy_op = [tf.assign(main_name, targetQ[var_name])
           for var_name, main_name in mainQ.items()]
copy_target_to_main = tf.group(*copy_op)

y = tf.placeholder(tf.float32, shape=(None, 1), name='y')
loss = tf.reduce_mean(tf.square(y - Q_action))
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

loss_summary = tf.summary.scalar('LOSS', loss)
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

train_loss = None
init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()
    for i in range(num_episodes):
        done = False
        obs = ModelReset()
        epoch = 0
        episodic_reward = 0
        action_counter = Counter()
        episodic_loss = []
        while not done:
            # Greedy action from the network, then epsilon-greedy exploration
            actions = mainQ_outputs.eval(
                feed_dict={X: [obs], in_training_mode: False})
            action = int(np.argmax(actions[0]))
            action_counter[str(action)] += 1
            action, epsilonn = epsilon_greedy(action, global_step)
            next_obs, reward, done = ModelStep(n_to_float_action(action))
            # Store the transition in the replay buffer
            exp_buffer[exp_buffer_pos, :] = np.array(
                [obs, action, next_obs, reward, done], dtype=object)
            exp_buffer_pos += 1
            if exp_buffer_pos >= exp_buffer_length:
                exp_buffer_pos = 0
                exp_buffer_full = True
            if global_step % steps_train == 0 and global_step > start_steps:
                # Sample a batch of transitions and do one gradient step
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)
                o_obs = [x for x in o_obs]
                o_next_obs = [x for x in o_next_obs]
                next_act = mainQ_outputs.eval(
                    feed_dict={X: o_next_obs, in_training_mode: False})
                y_batch = o_rew + discount_factor * np.max(next_act, axis=-1)
                train_loss, _ = sess.run(
                    [loss, training_op],
                    feed_dict={X: np.array(o_obs, dtype=np.float64),
                               y: np.expand_dims(
                                   np.array(y_batch, dtype=np.float64), axis=-1),
                               X_action: np.array(o_act, dtype=np.int32),
                               in_training_mode: True})
            if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                copy_target_to_main.run()
                print('copy_target_to_main.run()')
            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward
        print('Episode', i, 'Reward', episodic_reward, 'epsilon', epsilonn,
              'loss', train_loss)
        if (i+1) % 100 == 0:
            # Plot the trajectory of the episode that just finished
            plt.plot(x_Data[:i_D+1], y_Data[:i_D+1])
            plt.show()
I found the problem in my simple program: I had to normalize the state vector and the reward so that the values lie in the interval [-1, 1], and I had not done that. When I did, my simple program started to work well.
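A minimal sketch of this kind of normalization (the exact scale factors here are just an example, not necessarily the ones I used) looks like this:

# Example scale factors, chosen because the car starts at (1000, 1000) and the
# raw per-step reward stays below about 0.003 in magnitude.
POS_SCALE = 1000.0     # divide the coordinates by this so they lie in roughly [-1, 1]
REWARD_SCALE = 100.0   # multiply the reward by this so it stays well inside [-1, 1]

def normalize_obs(obs):
    # Rescale the coordinates; sin(fi) and cos(fi) are already in [-1, 1]
    x, y, c, s = obs
    return np.array([x / POS_SCALE, y / POS_SCALE, c, s], dtype=np.float64)

# In the training loop the observations and the reward are then rescaled, e.g.:
#   obs = normalize_obs(ModelReset())
#   ...
#   next_obs, reward, done = ModelStep(n_to_float_action(action))
#   next_obs = normalize_obs(next_obs)
#   reward = reward * REWARD_SCALE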