
How to use the model.fit() method for a deep Q-learning Treasure Hunt game (keras.models, Jupyter, Python)

I need to use GameExperience.get_data to retrieve the training data (inputs and targets), pass it to the model.fit method to train the model, and then call model.evaluate to determine the loss.
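
In outline, one training step acts in the maze, remembers the episode, pulls a batch with GameExperience.get_data, fits the network on it, and reads the loss back with model.evaluate. Below is a minimal sketch of that step, assuming the maze array, build_model, TreasureMaze, and GameExperience definitions shown further down (the epochs and batch_size values are arbitrary choices, not values from the original code):

    import random
    from TreasureMaze import TreasureMaze
    from GameExperience import GameExperience
    
    qmaze = TreasureMaze(maze)                 # maze and build_model come from the notebook below
    model = build_model(maze)
    experience = GameExperience(model, max_memory=8 * maze.size)
    
    qmaze.reset((0, 0))
    prev_envstate = qmaze.observe()
    action = random.choice(qmaze.valid_actions())          # one exploration step
    envstate, reward, game_status = qmaze.act(action)
    game_over = game_status != 'not_over'                  # remember() expects a boolean here
    
    experience.remember([prev_envstate, action, reward, envstate, game_over])
    inputs, targets = experience.get_data(data_size=32)    # shapes: (batch, maze.size), (batch, num_actions)
    
    model.fit(inputs, targets, epochs=8, batch_size=16, verbose=0)   # verbose=0 silences the progress bar
    loss = model.evaluate(inputs, targets, verbose=0)                # scalar MSE loss
    print("loss:", loss)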

Treasurehunt.ipynb


    from __future__ import print_function
    import os, sys, time, datetime, json, random
    import numpy as np
    from keras.models import Sequential
    from keras.layers.core import Dense, Activation
    from keras.optimizers import SGD , Adam, RMSprop
    from keras.layers.advanced_activations import PReLU
    import matplotlib.pyplot as plt
    from TreasureMaze import TreasureMaze
    from GameExperience import GameExperience
    %matplotlib inline
    
    maze = np.array([
        [ 1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.],
        [ 1.,  0.,  1.,  1.,  1.,  0.,  1.,  1.],
        [ 1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.],
        [ 1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.],
        [ 1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  0.,  1.,  0.,  0.,  0.],
        [ 1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.]
    ])
    
    def show(qmaze):
        plt.grid('on')
        nrows, ncols = qmaze.maze.shape
        ax = plt.gca()
        ax.set_xticks(np.arange(0.5, nrows, 1))
        ax.set_yticks(np.arange(0.5, ncols, 1))
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        canvas = np.copy(qmaze.maze)
        for row,col in qmaze.visited:
            canvas[row,col] = 0.6
        pirate_row, pirate_col, _ = qmaze.state
        canvas[pirate_row, pirate_col] = 0.3   # pirate cell
        canvas[nrows-1, ncols-1] = 0.9 # treasure cell
        img = plt.imshow(canvas, interpolation='none', cmap='gray')
        return img
    
    LEFT = 0
    UP = 1
    RIGHT = 2
    DOWN = 3
    
    
    # Exploration factor
    epsilon = 0.1
    
    # Actions dictionary
    actions_dict = {
        LEFT: 'left',
        UP: 'up',
        RIGHT: 'right',
        DOWN: 'down',
    }
    
    num_actions = len(actions_dict)
    
    qmaze = TreasureMaze(maze)
    canvas, reward, game_over = qmaze.act(DOWN)
    print("reward=", reward)
    print(game_over)
    show(qmaze)
    
    def play_game(model, qmaze, pirate_cell):
        qmaze.reset(pirate_cell)
        envstate = qmaze.observe()
        while True:
            prev_envstate = envstate
            # get next action
            q = model.predict(prev_envstate)
            action = np.argmax(q[0])
    
            # apply action, get rewards and new state
            envstate, reward, game_status = qmaze.act(action)
            if game_status == 'win':
                return True
            elif game_status == 'lose':
                return False
    
    def completion_check(model, qmaze):
        for cell in qmaze.free_cells:
            if not qmaze.valid_actions(cell):
                return False
            if not play_game(model, qmaze, cell):
                return False
        return True
    
    def build_model(maze):
        model = Sequential()
        model.add(Dense(maze.size, input_shape=(maze.size,)))
        model.add(PReLU())
        model.add(Dense(maze.size))
        model.add(PReLU())
        model.add(Dense(num_actions))
        model.compile(optimizer='adam', loss='mse')
        return model
    
    def qtrain(model, maze, **opt):
    
        # exploration factor
        global epsilon 
    
        # number of epochs
        n_epoch = opt.get('n_epoch', 15000)
    
        # maximum memory to store episodes
        max_memory = opt.get('max_memory', 1000)
    
        # maximum data size for training
        data_size = opt.get('data_size', 50)
    
        # start time
        start_time = datetime.datetime.now()
    
        # Construct environment/game from numpy array: maze (see above)
        qmaze = TreasureMaze(maze)
    
        # Initialize experience replay object
        experience = GameExperience(model, max_memory=max_memory)
        
        win_history = []   # history of win/lose game
        hsize = qmaze.maze.size//2   # history window size
        win_rate = 0.0
    
        # Pseudocode provided with the assignment:
        # For each epoch:
        #    Agent_cell = randomly select a free cell
        #    Reset the maze with agent set to above position
        #    Hint: Review the reset method in the TreasureMaze.py class.
        #    envstate = Environment.current_state
        #    Hint: Review the observe method in the TreasureMaze.py class.
        #    While state is not game over:
        #        previous_envstate = envstate
        #        Action = randomly choose action (left, right, up, down) either by exploration or by exploitation
        #        envstate, reward, game_status = qmaze.act(action)
        #        Hint: Review the act method in the TreasureMaze.py class.
        #        episode = [previous_envstate, action, reward, envstate, game_status]
        #        Store episode in Experience replay object
        #        Hint: Review the remember method in the GameExperience.py class.
        #        Train neural network model and evaluate loss
        #        Hint: Call GameExperience.get_data to retrieve training data (input and target) and pass to model.fit method
        #              to train the model. You can call model.evaluate to determine loss.
        #    If the win rate is above the threshold and your model passes the completion check, that would be your epoch.

        # Solution starts here
        for epoch in range(n_epoch):
            loss = 0.0

            # Randomly select a free cell and reset the maze with the agent there
            agent_cell = random.choice(qmaze.free_cells)
            qmaze.reset(agent_cell)
            envstate = qmaze.observe()

            n_episodes = 0
            game_status = 'not_over'
            while game_status == 'not_over':
                prev_envstate = envstate

                # Choose an action: explore with probability epsilon, otherwise exploit the model
                if np.random.rand() < epsilon:
                    action = random.choice(qmaze.valid_actions())
                else:
                    action = np.argmax(experience.predict(prev_envstate))

                # Apply the action, collect the reward and the new state
                envstate, reward, game_status = qmaze.act(action)
                game_over = game_status != 'not_over'   # remember() expects a boolean flag

                # Store the episode in the experience replay object
                episode = [prev_envstate, action, reward, envstate, game_over]
                experience.remember(episode)
                n_episodes += 1

                # Train the model on a batch of remembered episodes and evaluate the loss
                inputs, targets = experience.get_data(data_size=data_size)
                model.fit(inputs, targets)
                loss = model.evaluate(inputs, targets)

            # Record a win (1) or a loss (0) for this epoch and update the rolling win rate
            win_history.append(1 if game_status == 'win' else 0)
            if len(win_history) > hsize:
                win_rate = sum(win_history[-hsize:]) / hsize

            # Print the epoch, loss, episodes, win count, and win rate for each epoch
            dt = datetime.datetime.now() - start_time
            t = format_time(dt.total_seconds())
            template = "Epoch: {:03d}/{:d} | Loss: {:.4f} | Episodes: {:d} | Win count: {:d} | Win rate: {:.3f} | time: {}"
            print(template.format(epoch, n_epoch-1, loss, n_episodes, sum(win_history), win_rate, t))

            # Once the agent is winning consistently, reduce the exploration factor
            if win_rate > 0.9:
                epsilon = 0.05
            # We simply check if training has exhausted all free cells and if in all
            # cases the agent won.
            if sum(win_history[-hsize:]) == hsize and completion_check(model, qmaze):
                print("Reached 100%% win rate at epoch: %d" % (epoch,))
                break
        
        # Determine the total time for training
        dt = datetime.datetime.now() - start_time
        seconds = dt.total_seconds()
        t = format_time(seconds)
    
        print("n_epoch: %d, max_mem: %d, data: %d, time: %s" % (epoch, max_memory, data_size, t))
        return seconds
    
    # This is a small utility for printing readable time strings:
    def format_time(seconds):
        if seconds < 400:
            s = float(seconds)
            return "%.1f seconds" % (s,)
        elif seconds < 4000:
            m = seconds / 60.0
            return "%.2f minutes" % (m,)
        else:
            h = seconds / 3600.0
            return "%.2f hours" % (h,)
    
    qmaze = TreasureMaze(maze)
    show(qmaze)
    
    model = build_model(maze)
    #qtrain(model, maze, epochs=2, max_memory=8*maze.size, data_size=32)
    qtrain(model, maze, n_epoch=2, max_memory=8*maze.size, data_size=32)
    #qtrain(model, maze, 2, 8*maze.size, 32)
    
    print(completion_check(model, qmaze))
    show(qmaze)
    
    pirate_start = (0, 0)
    play_game(model, qmaze, pirate_start)
    show(qmaze)

TreasureMaze.py


    # This class represents the environment, which includes a maze object defined as a matrix. 
    
    import numpy as np
    
    visited_mark = 0.8  # The visited cells are marked by an 80% gray shade.
    pirate_mark = 0.5   # The current cell where the pirate is located is marked by a 50% gray shade.
    
    # The agent can move in one of four directions.
    LEFT = 0
    UP = 1
    RIGHT = 2
    DOWN = 3
    
    class TreasureMaze(object):
    
        # The maze is a two-dimensional Numpy array of floats between 0.0 and 1.0.
        # 1.0 corresponds to a free cell and 0.0 to an occupied cell.
        # pirate = (row, col) initial pirate position (defaults to (0,0))
    
        def __init__(self, maze, pirate=(0,0)):
            self._maze = np.array(maze)
            nrows, ncols = self._maze.shape
            self.target = (nrows-1, ncols-1)   # target cell where the "treasure" is
            self.free_cells = [(r,c) for r in range(nrows) for c in range(ncols) if self._maze[r,c] == 1.0]
            self.free_cells.remove(self.target)
            if self._maze[self.target] == 0.0:
                raise Exception("Invalid maze: target cell cannot be blocked!")
            if not pirate in self.free_cells:
                raise Exception("Invalid Pirate Location: must sit on a free cell")
            self.reset(pirate)
    
        # This method resets the pirate's position.
        
        def reset(self, pirate):
            self.pirate = pirate
            self.maze = np.copy(self._maze)
            nrows, ncols = self.maze.shape
            row, col = pirate
            self.maze[row, col] = pirate_mark
            self.state = (row, col, 'start')
            # To prevent the game from running excessively long, a minimum reward is defined.
            self.min_reward = -0.5 * self.maze.size
            self.total_reward = 0
            self.visited = set()
    
        # This method updates the state based on agent movement (valid, invalid, or blocked).
        
        def update_state(self, action):
            nrows, ncols = self.maze.shape
            nrow, ncol, nmode = pirate_row, pirate_col, mode = self.state
    
            if self.maze[pirate_row, pirate_col] > 0.0:
                self.visited.add((pirate_row, pirate_col))  # marks a visited cell
    
            valid_actions = self.valid_actions()
                    
            if not valid_actions:
                nmode = 'blocked'
            elif action in valid_actions:
                nmode = 'valid'
                if action == LEFT:
                    ncol -= 1
                elif action == UP:
                    nrow -= 1
                elif action == RIGHT:
                    ncol += 1
                elif action == DOWN:
                    nrow += 1
            else:
                nmode = 'invalid' # invalid action, no change in pirate position
    
            # New state
            self.state = (nrow, ncol, nmode)
    
        # This method returns a reward based on the agent movement guidelines.
        # The agent will be rewarded with positive or negative points, ranging from -1 to 1, for every movement. 
        # The highest reward is granted when the agent reaches the treasure cell. 
        # If the agent hits an occupied cell or attempts to go outside the maze boundary, it will incur the highest penalty. 
        # A penalty is also applied when the agent tries to revisit a cell, to prevent wandering within free cells. 
        
        def get_reward(self):
            pirate_row, pirate_col, mode = self.state
            nrows, ncols = self.maze.shape
            if pirate_row == nrows-1 and pirate_col == ncols-1:
                return 1.0
            if mode == 'blocked':
                return self.min_reward - 1
            if (pirate_row, pirate_col) in self.visited:
                return -0.25
            if mode == 'invalid':
                return -0.75
            if mode == 'valid':
                return -0.04
    
        # This method keeps track of the state and total reward based on agent action.
    
        def act(self,  action):
            self.update_state(action)
            reward = self.get_reward()
            self.total_reward += reward
            status = self.game_status()
            envstate = self.observe()
            return envstate, reward, status
    
        # This method returns the current environment state.
        
        def observe(self):
            canvas = self.draw_env()
            envstate = canvas.reshape((1, -1))
            return envstate
    
        # To help with visualization, this class includes a draw method to visualize the cells. 
        # Free cells are marked with white and occupied cells with black. 
    
        def draw_env(self):
            canvas = np.copy(self.maze)
            nrows, ncols = self.maze.shape
            # clear all visual marks
            for r in range(nrows):
                for c in range(ncols):
                    if canvas[r,c] > 0.0:
                        canvas[r,c] = 1.0
            # draw the pirate
            row, col, valid = self.state
            canvas[row, col] = pirate_mark
            return canvas
    
        # This method returns the game status.
        
        def game_status(self):
            # If the agent’s total reward goes below the minimum reward, the game is over.
            if self.total_reward < self.min_reward:
                return 'lose'
            pirate_row, pirate_col, mode = self.state
            nrows, ncols = self.maze.shape
            # If the agent reaches the treasure cell, the game is won.
            if pirate_row == nrows-1 and pirate_col == ncols-1:
                return 'win'
    
            # Game is not complete yet
            return 'not_over'
    
        # This method returns the set of valid actions starting from the current cell.
        
        def valid_actions(self, cell=None):
            if cell is None:
                row, col, mode = self.state
            else:
                row, col = cell
            actions = [0, 1, 2, 3]
            nrows, ncols = self.maze.shape
            if row == 0:
                actions.remove(1)
            elif row == nrows-1:
                actions.remove(3)
    
            if col == 0:
                actions.remove(0)
            elif col == ncols-1:
                actions.remove(2)
    
            if row>0 and self.maze[row-1,col] == 0.0:
                actions.remove(1)
            if row<nrows-1 and self.maze[row+1,col] == 0.0:
                actions.remove(3)
    
            if col>0 and self.maze[row,col-1] == 0.0:
                actions.remove(0)
            if col<ncols-1 and self.maze[row,col+1] == 0.0:
                actions.remove(2)
    
            return actions


GameExperience.py


    # This class stores the episodes, all the states that come in between the initial state and the terminal state. 
    # This is later used by the agent to learn from past episodes, a technique known as experience replay. 
    
    import numpy as np
    
    class GameExperience(object):
        
        # model = neural network model
        # max_memory = number of episodes to keep in memory. The oldest episode is deleted to make room for a new episode.
        # discount = discount factor; determines the importance of future rewards vs. immediate rewards
        
        def __init__(self, model, max_memory=100, discount=0.95):
            self.model = model
            self.max_memory = max_memory
            self.discount = discount
            self.memory = list()
            self.num_actions = model.output_shape[-1]
        
        # Stores episodes in memory
        
        def remember(self, episode):
            # episode = [envstate, action, reward, envstate_next, game_over]
            # memory[i] = episode
            # envstate == flattened 1d maze cells info, including pirate cell (see method: observe)
            self.memory.append(episode)
            if len(self.memory) > self.max_memory:
                del self.memory[0]
    
    # Returns the model's Q-value estimates for the current environment state
        def predict(self, envstate):
            return self.model.predict(envstate)[0]
    
        # Returns input and targets from memory, defaults to data size of 10
        def get_data(self, data_size=10):
            env_size = self.memory[0][0].shape[1]   # envstate 1d size (1st element of episode)
            mem_size = len(self.memory)
            data_size = min(mem_size, data_size)
            inputs = np.zeros((data_size, env_size))
            targets = np.zeros((data_size, self.num_actions))
            for i, j in enumerate(np.random.choice(range(mem_size), data_size, replace=False)):
                envstate, action, reward, envstate_next, game_over = self.memory[j]
                inputs[i] = envstate
                # There should be no target values for actions not taken.
                targets[i] = self.predict(envstate)
                # Q_sa = derived policy = max quality env/action = max_a' Q(s', a')
                Q_sa = np.max(self.predict(envstate_next))
                if game_over:
                    targets[i, action] = reward
                else:
                    # reward + gamma * max_a' Q(s', a')
                    targets[i, action] = reward + self.discount * Q_sa
            return inputs, targets

My code prints 1/1 [==============================] - 0s 397ms/step - loss: 0.0619 every time the model.fit() method runs inside the while loop. I want to avoid this output and use the method correctly. Please see the qtrain method.
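
For reference, the per-fit progress bar can be silenced with Keras's standard verbose argument. A minimal sketch of the fit/evaluate calls inside the loop (the epochs and batch_size values are arbitrary choices, not from the original code):

    # Inside the while loop, after pulling a batch from the replay memory:
    inputs, targets = experience.get_data(data_size=data_size)
    
    # verbose=0 suppresses the "1/1 [====...] - loss: ..." progress line
    model.fit(inputs, targets, epochs=8, batch_size=16, verbose=0)
    
    # evaluate returns the scalar MSE loss when no extra metrics are compiled;
    # verbose=0 keeps it quiet as well
    loss = model.evaluate(inputs, targets, verbose=0)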

I've read through your code and redone it on CoCalc.

First of all, this section has an error:


print("n_epoch: %d, max_mem: %d, data: %d, time: %s" % (epoch, max_memory, data_size, t))
        return seconds

it should be:

print("n_epoch: %d, max_mem: %d, data: %d, time: %s" % (n_epoch, max_memory, data_size, t))
        return seconds

With that fixed, the notebook should print the loss, plot the maze, and show the pirate's path.
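
If you also want to plot the loss curve, one option (a sketch using a hypothetical loss_history list, not part of the original assignment code) is to append each epoch's loss inside qtrain and return it, then plot it with the matplotlib import the notebook already has:

    # Hypothetical changes inside qtrain:
    #   before the epoch loop:        loss_history = []
    #   after model.evaluate(...):    loss_history.append(loss)
    #   at the end of qtrain:         return loss_history
    
    loss_history = qtrain(model, maze, n_epoch=2, max_memory=8*maze.size, data_size=32)
    
    plt.plot(loss_history)
    plt.xlabel('training epoch')
    plt.ylabel('loss (MSE)')
    plt.show()
    
    # Replay from the start cell and draw the pirate's path
    play_game(model, qmaze, (0, 0))
    show(qmaze)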
