[英]q-learning: ValueError: 'a' cannot be empty unless no samples are taken
我嘗試開發用於強化學習的 q-learning 算法,這是我的代碼:
import numpy as np
# Reward matrix: R[s, a] is the immediate reward for taking action a in
# state s; -1 marks an unavailable action.
# NOTE(review): rows 3 and 8 contain no entry >= 0, so available_actions()
# returns an empty array for those states -- this is what triggers
# "ValueError: 'a' cannot be empty unless no samples are taken" in
# np.random.choice during training.
R = np.matrix ([[-1, 0, -1, -1, 0, -1, -1, -1, -1],
[-1, -1, 100, 0, -1, -1, -1, -1, -1],
[-1, -1, 100, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, 100, 0, -1, -1],
[-1, -1, -1, -1, -1, 100, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, 100, 0],
[-1, -1, -1, -1, -1, -1, -1, 100, -1],
[-1, -1, -1, -1, -1, -1, -1, -1, -1]])
# Q-value table: one row per state, one column per action.
Q = np.matrix(np.zeros([9,9]))
# Gamma: discount factor of the Q-learning update (not a learning rate).
gamma = 0.4
# Initial state. (Usually to be chosen at random)
initial_state = 1
# This function returns all available actions in the state given as an argument
def available_actions(state):
    """Return the indices of the actions with reward >= 0 in ``state``.

    ``R`` is a module-level ``np.matrix``, so the row slice stays 2-D
    (1 x 9) and ``np.where`` yields (row_indices, col_indices); index [1]
    selects the action (column) indices.

    NOTE(review): for states 3 and 8 every reward is -1, so the returned
    array is empty -- feeding it to np.random.choice raises the reported
    ValueError.  The paste had lost all indentation; structure restored.
    """
    current_state_row = R[state, :]
    return np.where(current_state_row >= 0)[1]

# Get available actions in the current state
available_act = available_actions(initial_state)
# This function chooses at random which action to be performed within the range of all the available actions.
def sample_next_action(available_actions_range):
    """Pick one action uniformly at random from ``available_actions_range``.

    Bug fix: the original body read the module-level ``available_act``
    instead of its own parameter, so a caller passing a different action
    set would silently sample from a stale one.  Indentation restored.
    """
    return int(np.random.choice(available_actions_range, 1))
#sample next action to be performed
action = sample_next_action(available_act)

# This function updates the Q matrix according to the path selected and the Q learning algorithm
def update(current_state, action, gamma):
    """Apply one Q-learning step: Q[s, a] = R[s, a] + gamma * max(Q[a, :]).

    In this formulation the next state is identified with the chosen
    ``action``.  Ties for the next state's best Q-value are broken at
    random.  Mutates the module-level ``Q`` in place; indentation restored.
    """
    # Column indices holding the maximal Q-value of the next state's row.
    max_index = np.where(Q[action, :] == np.max(Q[action, :]))[1]
    if max_index.shape[0] > 1:
        # Several equally good follow-up actions: pick one at random.
        max_index = int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]
    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Update Q matrix
update(initial_state, action, gamma)
# Training
# Train over 10000 iterations. (Re-iterate the process above)  Indentation restored.
for i in range(10000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    # NOTE(review): when current_state is 3 or 8, available_act is empty
    # and np.random.choice raises the ValueError quoted in the question.
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

# Normalize the trained Q matrix
print("Trained Q matrix:")
print(Q / np.max(Q) * 100)
# Testing
# Goal state = 2: greedily follow the highest Q-value from state 1.
# Indentation restored.
current_state = 1
steps = [current_state]
while current_state != 2:
    # All actions tied for the best Q-value in the current state's row.
    next_step_index = np.where(Q[current_state, :] == np.max(Q[current_state, :]))[1]
    if next_step_index.shape[0] > 1:
        next_step_index = int(np.random.choice(next_step_index, size=1))
    else:
        next_step_index = int(next_step_index)
    steps.append(next_step_index)
    current_state = next_step_index

# Print selected sequence of steps
print("Selected path:")
print(steps)
但我總是有這個我不明白的錯誤:
46 current_state = np.random.randint(0, int(Q.shape[0])) 47 available_act = available_actions(current_state) ---> 48 action = sample_next_action(available_act) 49 update (current_state, action, gamma) 50
in sample_next_action(available_actions_range) 19 # 這個 function 在所有可用動作的范圍內隨機選擇要執行的動作。 20 def sample_next_action(available_actions_range): ---> 21 next_action = int(np.random.choice(available_act, 1)) 22 return next_action 23
mtrand.pyx 在 mtrand.RandomState.choice()
ValueError: 'a' 不能為空,除非沒有采樣
懇請各位提供任何幫助,謝謝!
代碼有很多缺陷:
將 R 和 Q 的數據結構更改為:
R = np.array ... Q = np.zeros([9, 9])
更改 state 3 和 state 8 的 R 矩陣,使這兩個狀態至少各有一個可用動作:只需在這些行中添加一個大於或等於零的值(available_actions 以 >= 0 判斷動作是否可用,因此 0 也可以)。
將 available_actions 定義更改為:
def available_actions(state): current_state_row = R[state, :] av_act = np.where(current_state_row >= 0)[0]
更改第 39 行以進行正確索引
max_index = np.where(Q[action,] == np.max(Q[action, :]))[0]
更改第 73 行以進行正確索引
next_step_index = np.where(Q[current_state,:] == np.max(Q[current_state,:]))[0]
通過這些更改,您應該能夠得到正確的數值結果。
最終結果將是:
所選路徑:[1, 2]
import numpy as np
# Reward matrix (plain ndarray, not np.matrix).  Compared with the broken
# version, rows 3 and 8 were given one entry >= 0 so that every state has
# at least one available action and np.random.choice never sees an empty set.
R = np.array([[-1, 0, -1, -1, 0, -1, -1, -1, -1],
[-1, -1, 100, 0, -1, -1, -1, -1, -1],
[-1, -1, 100, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, 0, -1, -1, -1],
[-1, -1, -1, -1, -1, 100, 0, -1, -1],
[-1, -1, -1, -1, -1, 100, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, 100, 0],
[-1, -1, -1, -1, -1, -1, -1, 100, -1],
[-1, -1, -1, -1, 0, -1, -1, -1, -1]])
# Q-value table: one row per state, one column per action.
Q = np.zeros([9,9])
# Gamma: discount factor of the Q-learning update (not a learning rate).
gamma = 0.4
# Initial state. (Usually to be chosen at random)
initial_state = 1
# This function returns all available actions in the state given as an argument
def available_actions(state, rewards=None):
    """Return the action indices with reward >= 0 in row ``state``.

    Parameters
    ----------
    state : int
        Row of the reward matrix to inspect.
    rewards : numpy.ndarray, optional
        Reward matrix to use; defaults to the module-level ``R``
        (backward-compatible generalization for reuse and testing).

    ``R`` is a 2-D ndarray here, so the row slice is 1-D and the action
    indices are at ``np.where(...)[0]``.  Indentation restored.
    """
    current_state_row = (R if rewards is None else rewards)[state, :]
    return np.where(current_state_row >= 0)[0]
# Actions available from the initial state.
available_act = available_actions(initial_state)
# This function chooses at random which action to be performed within the range of all the available actions.
def sample_next_action(available_actions_range):
    """Pick one action uniformly at random from ``available_actions_range``.

    Bug fix: even the "corrected" answer code still read the global
    ``available_act`` instead of the parameter it was given; use the
    argument so every call samples from the action set it was handed.
    """
    return int(np.random.choice(available_actions_range, 1))
# Sample the first action to perform from the initial state's action set.
action = sample_next_action(available_act)
# This function updates the Q matrix according to the path selected and the Q learning algorithm
def update(current_state, action, gamma):
    """Apply one Q-learning step: Q[s, a] = R[s, a] + gamma * max(Q[a, :]).

    The next state is identified with the chosen ``action`` in this
    formulation; ties for the next state's best Q-value are broken at
    random.  Mutates the module-level ``Q`` in place; indentation restored.
    """
    # Indices of the maximal Q-value in the next state's row (ndarray -> [0]).
    max_index = np.where(Q[action, :] == np.max(Q[action, :]))[0]
    if max_index.shape[0] > 1:
        # Several equally good follow-up actions: pick one at random.
        max_index = int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]
    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

# Update Q matrix
update(initial_state, action, gamma)
# Training
# Train over 10000 iterations. (Re-iterate the process above)  Indentation restored.
for i in range(10000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

# Normalize the trained Q matrix
print("Trained Q matrix:")
print(Q / np.max(Q) * 100)
# Testing
# Goal state = 2: greedily follow the highest Q-value from state 1.
# Indentation restored.
current_state = 1
steps = [current_state]
while current_state != 2:
    # All actions tied for the best Q-value in the current state's row.
    next_step_index = np.where(Q[current_state,:] == np.max(Q[current_state,:]))[0]
    if next_step_index.shape[0] > 1:
        next_step_index = int(np.random.choice(next_step_index, size=1))
    else:
        next_step_index = int(next_step_index)
    steps.append(next_step_index)
    current_state = next_step_index

# Print selected sequence of steps
print("Selected path:")
print(steps)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.