I need to use GameExperience.get_data to retrieve training data (input and target) and pass to model.fit method to train the model. I can call model.evaluate to determine loss.
Treasurehunt.ipynb
from __future__ import print_function
import os, sys, time, datetime, json, random
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD , Adam, RMSprop
from keras.layers.advanced_activations import PReLU
import matplotlib.pyplot as plt
from TreasureMaze import TreasureMaze
from GameExperience import GameExperience
%matplotlib inline
maze = np.array([
[ 1., 0., 1., 1., 1., 1., 1., 1.],
[ 1., 0., 1., 1., 1., 0., 1., 1.],
[ 1., 1., 1., 1., 0., 1., 0., 1.],
[ 1., 1., 1., 0., 1., 1., 1., 1.],
[ 1., 1., 0., 1., 1., 1., 1., 1.],
[ 1., 1., 1., 0., 1., 0., 0., 0.],
[ 1., 1., 1., 0., 1., 1., 1., 1.],
[ 1., 1., 1., 1., 0., 1., 1., 1.]
])
def show(qmaze):
plt.grid('on')
nrows, ncols = qmaze.maze.shape
ax = plt.gca()
ax.set_xticks(np.arange(0.5, nrows, 1))
ax.set_yticks(np.arange(0.5, ncols, 1))
ax.set_xticklabels([])
ax.set_yticklabels([])
canvas = np.copy(qmaze.maze)
for row,col in qmaze.visited:
canvas[row,col] = 0.6
pirate_row, pirate_col, _ = qmaze.state
canvas[pirate_row, pirate_col] = 0.3 # pirate cell
canvas[nrows-1, ncols-1] = 0.9 # treasure cell
img = plt.imshow(canvas, interpolation='none', cmap='gray')
return img
LEFT = 0
UP = 1
RIGHT = 2
DOWN = 3
# Exploration factor
epsilon = 0.1
# Actions dictionary
actions_dict = {
LEFT: 'left',
UP: 'up',
RIGHT: 'right',
DOWN: 'down',
}
num_actions = len(actions_dict)
qmaze = TreasureMaze(maze)
canvas, reward, game_over = qmaze.act(DOWN)
print("reward=", reward)
print(game_over)
show(qmaze)
def play_game(model, qmaze, pirate_cell):
qmaze.reset(pirate_cell)
envstate = qmaze.observe()
while True:
prev_envstate = envstate
# get next action
q = model.predict(prev_envstate)
action = np.argmax(q[0])
# apply action, get rewards and new state
envstate, reward, game_status = qmaze.act(action)
if game_status == 'win':
return True
elif game_status == 'lose':
return False
def completion_check(model, qmaze):
for cell in qmaze.free_cells:
if not qmaze.valid_actions(cell):
return False
if not play_game(model, qmaze, cell):
return False
return True
def build_model(maze):
model = Sequential()
model.add(Dense(maze.size, input_shape=(maze.size,)))
model.add(PReLU())
model.add(Dense(maze.size))
model.add(PReLU())
model.add(Dense(num_actions))
model.compile(optimizer='adam', loss='mse')
return model
def qtrain(model, maze, **opt):
# exploration factor
global epsilon
# number of epochs
n_epoch = opt.get('n_epoch', 15000)
# maximum memory to store episodes
max_memory = opt.get('max_memory', 1000)
# maximum data size for training
data_size = opt.get('data_size', 50)
# start time
start_time = datetime.datetime.now()
# Construct environment/game from numpy array: maze (see above)
qmaze = TreasureMaze(maze)
# Initialize experience replay object
experience = GameExperience(model, max_memory=max_memory)
win_history = [] # history of win/lose game
hsize = qmaze.maze.size//2 # history window size
win_rate = 0.0
#solution starts here
for i in range(n_epoch):
print("For loop started...")
Agent_cell = random.choice(qmaze.free_cells)
qmaze.reset(Agent_cell)
envstate = qmaze.observe()
game_status = 'not_over'
while game_status=='not_over':
print("While loop started...")
prev_envstate = envstate
q = model.predict(prev_envstate)
action = random.choice(actions_dict)
envstate, reward, game_status = qmaze.act(action)
actionInt=list(actions_dict.keys())[list(actions_dict.values()).index(action)]
episode = [prev_envstate, actionInt, reward, envstate, game_status]
experience.remember(episode)
inputs,targets = experience.get_data()
model.fit(inputs, targets)
#win_rate = model.evaluate(inputs, targets)
#print(model.evaluate(inputs, targets))
print(game_status)
print("While loop ended")
#print("While ended. executing rest of Fr loop...")
if win_rate > 0.9 and completion_check(model, qmaze):
epoch = i
print(i)
#print("For loop ended...")
# pseudocode:
# For each epoch:
# Agent_cell = randomly select a free cell
# Reset the maze with agent set to above position
# Hint: Review the reset method in the TreasureMaze.py class.
# envstate = Environment.current_state
# Hint: Review the observe method in the TreasureMaze.py class.
# While state is not game over:
# previous_envstate = envstate
# Action = randomly choose action (left, right, up, down) either by exploration or by exploitation
# envstate, reward, game_status = qmaze.act(action)
# Hint: Review the act method in the TreasureMaze.py class.
# episode = [previous_envstate, action, reward, envstate, game_status]
# Store episode in Experience replay object
# Hint: Review the remember method in the GameExperience.py class.
# Train neural network model and evaluate loss
# Hint: Call GameExperience.get_data to retrieve training data (input and target) and pass to model.fit method
# to train the model. You can call model.evaluate to determine loss.
# If the win rate is above the threshold and your model passes the completion check, that would be your epoch.
#Print the epoch, loss, episodes, win count, and win rate for each epoch
dt = datetime.datetime.now() - start_time
t = format_time(dt.total_seconds())
template = "Epoch: {:03d}/{:d} | Loss: {:.4f} | Episodes: {:d} | Win count: {:d} | Win rate: {:.3f} | time: {}"
print(template.format(epoch, n_epoch-1, loss, n_episodes, sum(win_history), win_rate, t))
# We simply check if training has exhausted all free cells and if in all
# cases the agent won.
if win_rate > 0.9 : epsilon = 0.05
if sum(win_history[-hsize:]) == hsize and completion_check(model, qmaze):
print("Reached 100%% win rate at epoch: %d" % (epoch,))
break
# Determine the total time for training
dt = datetime.datetime.now() - start_time
seconds = dt.total_seconds()
t = format_time(seconds)
print("n_epoch: %d, max_mem: %d, data: %d, time: %s" % (epoch, max_memory, data_size, t))
return seconds
# This is a small utility for printing readable time strings:
def format_time(seconds):
if seconds < 400:
s = float(seconds)
return "%.1f seconds" % (s,)
elif seconds < 4000:
m = seconds / 60.0
return "%.2f minutes" % (m,)
else:
h = seconds / 3600.0
return "%.2f hours" % (h,)
qmaze = TreasureMaze(maze)
show(qmaze)
model = build_model(maze)
#qtrain(model, maze, epochs=2, max_memory=8*maze.size, data_size=32)
qtrain(model, maze, n_epoch=2, max_memory=8*maze.size, data_size=32)
#qtrain(model, maze, 2, 8*maze.size, 32)
print(completion_check(model, qmaze))
show(qmaze)
pirate_start = (0, 0)
play_game(model, qmaze, pirate_start)
show(qmaze)
TreasureMaze.py
# This class represents the environment, which includes a maze object defined as a matrix.
import numpy as np
visited_mark = 0.8 # The visited cells are marked by an 80% gray shade.
pirate_mark = 0.5 # The current cell where the pirate is located is marked by a 50% gray shade.
# The agent can move in one of four directions.
LEFT = 0
UP = 1
RIGHT = 2
DOWN = 3
class TreasureMaze(object):
# The maze is a two-dimensional Numpy array of floats between 0.0 and 1.0.
# 1.0 corresponds to a free cell and 0.0 to an occupied cell.
# pirate = (row, col) initial pirate position (defaults to (0,0))
def __init__(self, maze, pirate=(0,0)):
self._maze = np.array(maze)
nrows, ncols = self._maze.shape
self.target = (nrows-1, ncols-1) # target cell where the "treasure" is
self.free_cells = [(r,c) for r in range(nrows) for c in range(ncols) if self._maze[r,c] == 1.0]
self.free_cells.remove(self.target)
if self._maze[self.target] == 0.0:
raise Exception("Invalid maze: target cell cannot be blocked!")
if not pirate in self.free_cells:
raise Exception("Invalid Pirate Location: must sit on a free cell")
self.reset(pirate)
# This method resets the pirate's position.
def reset(self, pirate):
self.pirate = pirate
self.maze = np.copy(self._maze)
nrows, ncols = self.maze.shape
row, col = pirate
self.maze[row, col] = pirate_mark
self.state = (row, col, 'start')
# To prevent the game from running excessively long, a minimum reward is defined.
self.min_reward = -0.5 * self.maze.size
self.total_reward = 0
self.visited = set()
# This method updates the state based on agent movement (valid, invalid, or blocked).
def update_state(self, action):
nrows, ncols = self.maze.shape
nrow, ncol, nmode = pirate_row, pirate_col, mode = self.state
if self.maze[pirate_row, pirate_col] > 0.0:
self.visited.add((pirate_row, pirate_col)) # marks a visited cell
valid_actions = self.valid_actions()
if not valid_actions:
nmode = 'blocked'
elif action in valid_actions:
nmode = 'valid'
if action == LEFT:
ncol -= 1
elif action == UP:
nrow -= 1
if action == RIGHT:
ncol += 1
elif action == DOWN:
nrow += 1
else:
mode = 'invalid' # invalid action, no change in pirate position
# New state
self.state = (nrow, ncol, nmode)
# This method returns a reward based on the agent movement guidelines.
# The agent will be rewarded with positive or negative points, ranging from -1 to 1, for every movement.
# The highest reward is granted when the agent reaches the treasure cell.
# If the agent hits an occupied cell or attempts to go outside the maze boundary, it will incur the highest penalty.
# A penalty is also applied when the agent tries to revisit a cell, to prevent wandering within free cells.
def get_reward(self):
pirate_row, pirate_col, mode = self.state
nrows, ncols = self.maze.shape
if pirate_row == nrows-1 and pirate_col == ncols-1:
return 1.0
if mode == 'blocked':
return self.min_reward - 1
if (pirate_row, pirate_col) in self.visited:
return -0.25
if mode == 'invalid':
return -0.75
if mode == 'valid':
return -0.04
# This method keeps track of the state and total reward based on agent action.
def act(self, action):
self.update_state(action)
reward = self.get_reward()
self.total_reward += reward
status = self.game_status()
envstate = self.observe()
return envstate, reward, status
# This method returns the current environment state.
def observe(self):
canvas = self.draw_env()
envstate = canvas.reshape((1, -1))
return envstate
# To help with visualization, this class includes a draw method to visualize the cells.
# Free cells are marked with white and occupied cells with black.
def draw_env(self):
canvas = np.copy(self.maze)
nrows, ncols = self.maze.shape
# clear all visual marks
for r in range(nrows):
for c in range(ncols):
if canvas[r,c] > 0.0:
canvas[r,c] = 1.0
# draw the pirate
row, col, valid = self.state
canvas[row, col] = pirate_mark
return canvas
# This method returns the game status.
def game_status(self):
# If the agent’s total reward goes below the minimum reward, the game is over.
if self.total_reward < self.min_reward:
return 'lose'
pirate_row, pirate_col, mode = self.state
nrows, ncols = self.maze.shape
# If the agent reaches the treasure cell, the game is won.
if pirate_row == nrows-1 and pirate_col == ncols-1:
return 'win'
# Game is not complete yet
return 'not_over'
# This method returns the set of valid actions starting from the current cell.
def valid_actions(self, cell=None):
if cell is None:
row, col, mode = self.state
else:
row, col = cell
actions = [0, 1, 2, 3]
nrows, ncols = self.maze.shape
if row == 0:
actions.remove(1)
elif row == nrows-1:
actions.remove(3)
if col == 0:
actions.remove(0)
elif col == ncols-1:
actions.remove(2)
if row>0 and self.maze[row-1,col] == 0.0:
actions.remove(1)
if row<nrows-1 and self.maze[row+1,col] == 0.0:
actions.remove(3)
if col>0 and self.maze[row,col-1] == 0.0:
actions.remove(0)
if col<ncols-1 and self.maze[row,col+1] == 0.0:
actions.remove(2)
return actions
GameExperience.py
# This class stores the episodes, all the states that come in between the initial state and the terminal state.
# This is later used by the agent for learning by experience, called "exploration".
import numpy as np
class GameExperience(object):
# model = neural network model
# max_memory = number of episodes to keep in memory. The oldest episode is deleted to make room for a new episode.
# discount = discount factor; determines the importance of future rewards vs. immediate rewards
def __init__(self, model, max_memory=100, discount=0.95):
self.model = model
self.max_memory = max_memory
self.discount = discount
self.memory = list()
self.num_actions = model.output_shape[-1]
# Stores episodes in memory
def remember(self, episode):
# episode = [envstate, action, reward, envstate_next, game_over]
# memory[i] = episode
# envstate == flattened 1d maze cells info, including pirate cell (see method: observe)
self.memory.append(episode)
if len(self.memory) > self.max_memory:
del self.memory[0]
# Predicts the next action based on the current environment state
def predict(self, envstate):
return self.model.predict(envstate)[0]
# Returns input and targets from memory, defaults to data size of 10
def get_data(self, data_size=10):
env_size = self.memory[0][0].shape[1] # envstate 1d size (1st element of episode)
mem_size = len(self.memory)
data_size = min(mem_size, data_size)
inputs = np.zeros((data_size, env_size))
targets = np.zeros((data_size, self.num_actions))
for i, j in enumerate(np.random.choice(range(mem_size), data_size, replace=False)):
envstate, action, reward, envstate_next, game_over = self.memory[j]
inputs[i] = envstate
# There should be no target values for actions not taken.
targets[i] = self.predict(envstate)
# Q_sa = derived policy = max quality env/action = max_a' Q(s', a')
Q_sa = np.max(self.predict(envstate_next))
if game_over:
targets[i, action] = reward
else:
# reward + gamma * max_a' Q(s', a')
targets[i, action] = reward + self.discount * Q_sa
return inputs, targets
My code prints 1/1 [==============================] - 0s 397ms/step - loss: 0.0619
when it runs the model.fit()
method inside the while loop. I want to avoid this and I should use that method correctly. Please see the qtrain
method.
Ive read through your code and redone it on Cocalc
first of all this section has an error:
Ive read through your code and redone it on Cocalc
first of all this section has an error:
print("n_epoch: %d, max_mem: %d, data: %d, time: %s" % (epoch, max_memory, data_size, t))
return seconds
it should be:
print("n_epoch: %d, max_mem: %d, data: %d, time: %s" % (n_epoch, max_memory, data_size, t))
return seconds
this should return the loss the plotted axes and the pirates path.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.