Python policy gradient reinforcement learning with continuous action space is not working
I am trying to teach an agent to navigate to a target in my custom environment. The agent learns with a neural net (two hidden dense layers, a dropout layer, and an output layer of dimension 4). As input nodes the agent uses sensor measurements of the distance to the surrounding obstacles, plus the angle and distance to the target (if it is in the field of view). (I normalize all inputs to lie in [0, 1].) As output we get the mean and standard deviation of the next action (the angle and the distance the robot will move). The action taken is then sampled from this normal distribution (a Gaussian policy). I used a Gaussian policy because it was the only way I saw to define a log-likelihood, which I want to use during training.

During training the agent walks along a trajectory until it hits an obstacle or a wall, and then the discounted future reward after each step is computed. I now want to train the agent with policy gradient ascent so that it collects a larger return. You can also find the idea here: https://spinningup.openai.com/en/latest/spinningup/rl_intro.html

I will post my Python approach below, but somehow it is not working. Maybe I am missing something. I am really desperate and would appreciate any hints. (The key part is the train() method in the Agent class.)
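For reference, this is the diagonal-Gaussian log-likelihood I am implementing in train() (a minimal standalone NumPy sketch of the formula; the function name is just for illustration):

import numpy as np

def gaussian_log_likelihood(action, mean, std):
    # log-density of a diagonal Gaussian, summed over the action dimensions
    action, mean, std = np.asarray(action), np.asarray(mean), np.asarray(std)
    return -0.5 * np.sum((action - mean) ** 2 / std ** 2
                         + 2 * np.log(std)
                         + np.log(2 * np.pi))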
import math
import random
import numpy as np
from numpy import arange, meshgrid
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class Maze:
    def __init__(self, length, height, target, obstacles=[]):
        self.length = length
        self.height = height
        self.obstacles = obstacles
        self.target = target

    def drawMaze(self, res=0.01):
        # plot maze outline:
        x = [0, 0, self.length, self.length, 0]
        y = [0, self.height, self.height, 0, 0]
        plt.plot(x, y)
        # plot maze target
        plt.plot(self.target.pos_x, self.target.pos_y, 'ro', label='target')
        # plot maze obstacles with resolution res
        x_mesh = arange(0, self.length, res)
        y_mesh = arange(0, self.height, res)
        X_mesh, Y_mesh = meshgrid(x_mesh, y_mesh)  # grid of points
        Z = np.zeros(X_mesh.size).reshape(y_mesh.size, x_mesh.size)
        for obstacle in self.obstacles:
            Z = np.maximum(Z, obstacle.func(X_mesh, Y_mesh))  # evaluate the obstacle function on the grid
        plt.imshow(np.heaviside(np.array([z.astype(float) for z in Z]), 0), extent=[0, self.length, 0, self.height], cmap='Pastel1', origin='lower')
        plt.title('MyMaze')
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), fancybox=True, shadow=True, ncol=5)  # legend below plot
class Obstacle:
    def __init__(self, func):
        # func(x, y) > 0 if the point lies inside the obstacle, <= 0 if not;
        # the function should be continuous so that the algorithm can find its minimum
        self.func = func

    def isPointInObstacle(self, point):
        if self.func(point) >= 0:
            return 1
        else:
            return 0
class Target:
    def __init__(self, pos_x, pos_y):
        self.pos_x = pos_x
        self.pos_y = pos_y
class Agent:
    def __init__(self, maze, pos_x=1, pos_y=1, rays=360, alpha=0.001):
        self.posvec = [[pos_x, pos_y]]
        self.rays = rays
        self.view = []    # distance vector to next wall/obstacle
        self.points = []  # point vector to next wall/obstacle
        self.maze = maze
        self.memory = []
        self.memoryCounter = 0
        self.lastMove = np.array([0, 0])
        self.model = keras.Sequential()
        self.model.add(layers.Dense(64, batch_input_shape=(1, rays + 2), activation="relu"))
        self.model.add(layers.Dense(64, activation="relu"))
        self.model.add(layers.Dropout(.2))
        self.model.add(layers.Dense(4))
        self.reward = []
        self.isEnd = False
        self.gamma = 0.3  # discount factor; gamma <= 0.5 so that it cannot be efficient to run infinitely long and gather rewards
        self.learningRate = alpha
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learningRate))
        self.actionMemory = []
        self.updateMemory()
    def look(self, res=0.01):
        '''
        Divides 360 degrees into `rays` rays and returns the endpoints of each
        ray, i.e. where the ray hits the maze border or an obstacle.
        '''
        rays = self.rays
        angle = arange(0, 360, 360 / rays)
        points = []
        distance = []
        for alpha in angle:
            # intersection with maze borders
            if alpha == 0:
                t_min_maze = (self.maze.height - self.posvec[-1][1]) / math.cos(math.radians(alpha))
            if alpha == 180:
                t_min_maze = (0 - self.posvec[-1][1]) / math.cos(math.radians(alpha))
            if alpha == 90:
                t_min_maze = (self.maze.length - self.posvec[-1][0]) / math.sin(math.radians(alpha))
            if alpha == 270:
                t_min_maze = (0 - self.posvec[-1][0]) / math.sin(math.radians(alpha))
            if 0 < alpha < 90:
                t_min_maze = min((self.maze.length - self.posvec[-1][0]) / math.sin(math.radians(alpha)),
                                 (self.maze.height - self.posvec[-1][1]) / math.cos(math.radians(alpha)))
            if 90 < alpha < 180:
                t_min_maze = min((self.maze.length - self.posvec[-1][0]) / math.sin(math.radians(alpha)),
                                 (0 - self.posvec[-1][1]) / math.cos(math.radians(alpha)))
            if 180 < alpha < 270:
                t_min_maze = min((0 - self.posvec[-1][0]) / math.sin(math.radians(alpha)),
                                 (0 - self.posvec[-1][1]) / math.cos(math.radians(alpha)))
            if alpha > 270:
                t_min_maze = min((0 - self.posvec[-1][0]) / math.sin(math.radians(alpha)),
                                 (self.maze.height - self.posvec[-1][1]) / math.cos(math.radians(alpha)))
            # intersection with obstacles
            t_min_obstacle = t_min_maze
            for obstacle in self.maze.obstacles:
                def ray(t):
                    x = self.posvec[-1][0] + t * math.sin(math.radians(alpha))
                    y = self.posvec[-1][1] + t * math.cos(math.radians(alpha))
                    return obstacle.func(x, y)
                for t_ray in np.arange(0, t_min_obstacle + 1, res):
                    if ray(t_ray) > 0 and 0 < t_ray < t_min_maze:
                        t_min_obstacle = t_ray
                        break
            t_min = t_min_obstacle
            # calculate the endpoint of the beam:
            distance.append(t_min)
            points.append([self.posvec[-1][0] + t_min * math.sin(math.radians(alpha)),
                           self.posvec[-1][1] + t_min * math.cos(math.radians(alpha))])
        self.view = distance
        self.points = points
        return points
    def run(self, angle, distance):
        '''
        Checks whether the path is free of obstacles; if so, walks the full path,
        otherwise only walks until it hits the obstacle.
        Updates self.memory, self.isEnd and self.posvec.
        '''
        newaction = [angle, distance]
        self.actionMemory.append(newaction)
        if self.isEnd:
            print("Agent has already crashed into an obstacle, please reset the agent with Agentname.reset()")
            return
        else:
            isRunPossible, t_ray = self.IsWalkPossible(angle, distance)
            if not isRunPossible:
                print("Agent crashed into an obstacle, please reset the agent with Agentname.reset()")
            self.isEnd = not isRunPossible
            # float() handles both scalar and one-element-array angle/distance inputs
            self.posvec.append([float(self.posvec[-1][0] + distance * t_ray * math.sin(math.radians(angle))),
                                float(self.posvec[-1][1] + distance * t_ray * math.cos(math.radians(angle)))])
            self.lastMove = [angle, t_ray * distance]
            self.updateMemory()
            if not isRunPossible:
                self.draw(0.01, False, True)
            return
    def draw(self, res=0.01, plotView=True, drawPath=True):
        self.maze.drawMaze(res)
        # plot the view rays as green lines
        if self.points and plotView:
            for point in self.points:
                plt.plot([self.posvec[-1][0], point[0]], [self.posvec[-1][1], point[1]], 'g')
        # plot agent:
        plt.plot(self.posvec[-1][0], self.posvec[-1][1], 'bx', label='agent')
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), fancybox=True, shadow=True, ncol=5)
        if drawPath:
            self.drawPath()

    def drawPath(self):
        lastpoint = [self.posvec[0][0], self.posvec[0][1]]
        for point in self.posvec:
            plt.plot([lastpoint[0], point[0]], [lastpoint[1], point[1]], 'b')
            lastpoint = point
    def seeTarget(self, res=0.01):
        '''
        Check whether there is an obstacle on the direct line from agent to target.
        Return True if the agent can see the target, else False.
        '''
        for obstacle in self.maze.obstacles:
            # parametrize the line from agent to target
            def ray(t):
                x = self.posvec[-1][0] + t * (self.maze.target.pos_x - self.posvec[-1][0])
                y = self.posvec[-1][1] + t * (self.maze.target.pos_y - self.posvec[-1][1])
                return obstacle.func(x, y)
            for t_ray in np.arange(0, 1, res):
                if ray(t_ray) > 0:
                    return False
            if ray(1) > 0:
                return False
        return True
    def IsWalkPossible(self, angle, distance, res=0.01):
        '''
        Return (False, t) if the agent would cross a wall or an obstacle while
        walking the track, where t is the parameter at which that happens.
        Return (True, 1) otherwise.
        '''
        def pos(t):
            x = self.posvec[-1][0] + t * distance * math.sin(math.radians(angle))
            y = self.posvec[-1][1] + t * distance * math.cos(math.radians(angle))
            return x, y
        # check the maze borders
        for t_ray in np.arange(0, 1, res):
            if pos(t_ray)[0] < 0 or pos(t_ray)[0] > self.maze.length or pos(t_ray)[1] < 0 or pos(t_ray)[1] > self.maze.height:
                return False, t_ray
        if pos(1)[0] < 0 or pos(1)[0] > self.maze.length or pos(1)[1] < 0 or pos(1)[1] > self.maze.height:
            return False, 1
        # check the obstacles
        for obstacle in self.maze.obstacles:
            def ray(t):
                x, y = pos(t)
                return obstacle.func(x, y)
            for t_ray in np.arange(0, 1, res):
                if ray(t_ray) > 0:
                    return False, t_ray
            if ray(1) > 0:
                return False, 1
        return True, 1
    def angleDistanceTarget(self):
        v1 = [0, 1]
        v2 = [self.maze.target.pos_x - self.posvec[-1][0], self.maze.target.pos_y - self.posvec[-1][1]]
        angle = math.acos(np.dot(v1, v2 / np.linalg.norm(v2)))
        if self.maze.target.pos_x - self.posvec[-1][0] < 0:
            return 360 - np.rad2deg(angle), np.linalg.norm(v2)
        else:
            return np.rad2deg(angle), np.linalg.norm(v2)
    def updateMemory(self):
        if not self.isEnd:
            self.look()
        newMemory = list(self.view)  # copy so that the appends below do not mutate self.view
        if self.seeTarget():
            angle, distance = self.angleDistanceTarget()
            newMemory.append(angle)
            newMemory.append(distance)
        else:
            newMemory.append(-1)
            newMemory.append(-1)
        self.memory.append(newMemory)
        if self.memoryCounter > 0:
            self.reward.append(self.rewardFunction())
        self.memoryCounter += 1
    def reset(self):
        self.posvec = [[random.random() * self.maze.length, random.random() * self.maze.height]]
        self.view = []    # distance vector to next wall/obstacle
        self.points = []  # point vector to next wall/obstacle
        self.memory = []
        self.memoryCounter = 0
        self.lastMove = np.array([0, 0])
        self.isEnd = False
        self.reward = []
        self.updateMemory()

    def evaluate(self):
        return
    def rewardFunction(self):
        if self.isEnd:
            print("crashed into obstacle")
            return -1
        else:
            distance_target_squared = (self.posvec[-1][0] - self.maze.target.pos_x)**2 + (self.posvec[-1][1] - self.maze.target.pos_y)**2
            if distance_target_squared == 0:
                print("Agent reached target")
                self.isEnd = True
            return np.exp(-distance_target_squared)
    def train(self, num_weight_updates=50):
        print(self.model.summary())
        maze_diag = (self.maze.height**2 + self.maze.length**2)**0.5
        for i in range(num_weight_updates):
            print("weight update number:", i)
            # create a new full trajectory of the agent
            self.reset()
            gradient = []
            iteration = 0
            while not self.isEnd and iteration < 20:
                with tf.GradientTape() as tape:
                    iteration += 1
                    print("iteration:", iteration)
                    insertMemory = np.expand_dims(self.memory[-1], axis=0)  # does not change the shape of self.memory
                    insertMemory = np.true_divide(insertMemory, maze_diag)  # so that lengths are at most 1
                    insertMemory[0, -2] = insertMemory[0, -2] * maze_diag / 360  # rescale the angle entry so it is normalized by 360 instead of the maze diagonal
                    probs = self.model(insertMemory)
                    meannewangle = probs[0, 0] * 360
                    meannewdistance = probs[0, 1] * maze_diag
                    stdnewangle = np.exp(probs[0, 2])
                    stdnewdistance = np.exp(probs[0, 3])
                    random_normal = tf.random.normal([1], 0, 1, tf.float32, seed=1).numpy()
                    run_angle = meannewangle + random_normal * stdnewangle
                    run_distance = meannewdistance + random_normal * stdnewdistance
                    log_likelihood = -0.5 * ((run_angle - meannewangle)**2 / stdnewangle**2 + 2 * math.log(stdnewangle)
                                             + (run_distance - meannewdistance)**2 / stdnewdistance**2 + 2 * math.log(stdnewdistance)
                                             + 2 * math.log(2 * math.pi))
                gradient.append(tape.gradient(log_likelihood, self.model.trainable_variables))
                run_distance = np.exp(run_distance)  # so that it is positive
                self.run(run_angle, run_distance)
            self.draw()
            # calculate the discounted reward:
            discountedReward = np.zeros_like(self.reward, dtype=float)
            for t in range(len(self.reward)):
                discount = 1
                tmp = 0
                for k in range(t, len(self.reward)):
                    tmp += self.reward[k] * discount
                    discount *= self.gamma
                discountedReward[t] = tmp
            # update weights
            for j in range(iteration):
                grad = [old_grad * discountedReward[j] for old_grad in gradient[j]]  # multiplication with the learning rate is done by the Adam optimizer
                self.model.optimizer.apply_gradients(zip(grad, self.model.trainable_variables))
    def play(self):
        maze_diag = (self.maze.height**2 + self.maze.length**2)**0.5
        self.reset()
        iteration = 0
        while not self.isEnd and iteration < 10:
            iteration += 1
            print("iteration:", iteration)
            insertMemory = np.expand_dims(self.memory[-1], axis=0)  # does not change the shape of self.memory
            insertMemory = np.true_divide(insertMemory, maze_diag)  # so that lengths are at most 1
            insertMemory[0, -2] = insertMemory[0, -2] * maze_diag / 360  # rescale the angle entry so it is normalized by 360 instead of the maze diagonal
            probs = self.model(insertMemory)
            meannewangle = probs[0, 0]
            meannewdistance = probs[0, 1]
            stdnewangle = np.exp(probs[0, 2])
            stdnewdistance = np.exp(probs[0, 3])
            random_normal = tf.random.normal([1], 0, 1, tf.float32, seed=1).numpy()
            run_angle = meannewangle + random_normal * stdnewangle
            run_distance = meannewdistance + random_normal * stdnewdistance
            run_distance = np.exp(run_distance)  # so that it is positive
            print("angle:", run_angle, "distance:", run_distance)
            self.run(run_angle, run_distance)
        self.draw()
I know this is really long, but I thought the error might be somewhere else, so if you want to test the code it is enough to test it without any obstacles at all, because it does not work even in this simple case, even when I train it for 500 iterations :(
Just run the following code:
listOfObstacles = []
myTarget = Target(1, 2)
myMaze = Maze(5, 3, myTarget, listOfObstacles)
myAgent = Agent(myMaze, 4, 1, 10)
myAgent.train()
myAgent.play()
500 iterations is really low for a reinforcement learning problem, especially when neural networks are involved. Try increasing it (by a lot: at least 100,000 iterations), and try looking at the reward at the end of each episode to see whether it increases.
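For example, at the end of each episode in train() you could record the undiscounted return and plot it after training (just a sketch; self.episodeReturns is a hypothetical list you would initialize as [] in __init__):

# inside train(), after a trajectory has finished:
self.episodeReturns.append(sum(self.reward))  # hypothetical attribute, not in the code above

# after training (plt is matplotlib.pyplot, already imported above):
plt.plot(myAgent.episodeReturns)
plt.xlabel("episode")
plt.ylabel("undiscounted return")
plt.show()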
Then, if you still see zero progress after many iterations, you should check whether you implemented the algorithm correctly, or whether there are any hidden bugs in the environment.
To rule out the first problem, you should try an RL library such as Stable Baselines to make sure there are no errors in your implementation of the algorithm. Then, if it still does not work, you should look for bugs in your code, or reconsider the way you defined this RL problem.
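For example, here is a minimal sketch using Stable Baselines3 on a standard Gym task with a continuous action space (Pendulum-v1 stands in for your maze here; to train on your own environment you would have to wrap it in a gym.Env subclass):

import gym
from stable_baselines3 import PPO

# Pendulum-v1 has a continuous (Box) action space, like your angle/distance actions
env = gym.make("Pendulum-v1")

model = PPO("MlpPolicy", env, verbose=1)  # PPO uses a Gaussian policy for continuous actions
model.learn(total_timesteps=100_000)      # far more experience than 500 short episodes

# quick rollout with the trained policy
obs = env.reset()
for _ in range(200):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()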
Look at similar problems and check how they define the MDP, especially the reward function.
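For instance, many navigation examples use a dense, shaped reward instead of an almost-flat one. A hypothetical sketch (the coefficients are made up, not taken from your problem):

def shaped_reward(old_dist, new_dist, crashed, reached):
    # hypothetical shaping: reward progress towards the target on every step
    if crashed:
        return -10.0
    if reached:
        return 100.0
    return (old_dist - new_dist) - 0.1  # small per-step penalty discourages wandering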
Also, remove that dropout layer: in a supervised learning setting it does usually help to reduce overfitting (by introducing some variance), but in RL extra variance is not really what we want.
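In your Agent.__init__ that would look something like this (same architecture, just without the Dropout layer):

self.model = keras.Sequential([
    layers.Dense(64, batch_input_shape=(1, rays + 2), activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(4),  # [mean angle, mean distance, log-std angle, log-std distance]
])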