tensorflow, my DQN FlappyBird agent does relatively well in training but after I load the trained model after the session, it is awful

My deep Q-learning code for Flappy Bird is as follows:

import gym_ple
import gym
import os
import numpy as np
import cv2
from collections import deque
from datetime import datetime
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv2D, Flatten, InputLayer, MaxPooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard



gym_ple.main(200)
tf.compat.v1.enable_eager_execution()
env = gym.make('FlappyBird-v0') # actions: 1 does nothing; 0 flaps
imagens = []


## Image processing
def image_processing(image):
    if np.count_nonzero(image) != 0:
        img = cv2.resize(image[:-100,50:,1],dsize=(30,51)) # keep only the green channel, crop the image and resize it
        return img/255

    else:
        return np.zeros((51,30))

## Stack frames
def frame_stack(stacked_frames, state, is_new, n_stack):
    frame = image_processing(state)
    if is_new:
        stacked_frames = deque([np.zeros(frame.shape, dtype = int) for f in range(n_stack)], maxlen=n_stack)
        for f in range(n_stack):
            stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis=2)
    else:
        stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames

class Memory:
    def __init__(self,max_size):
        self.buffer = deque(maxlen = max_size)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        buffer_size = len(self.buffer)
        index = np.random.choice(range(buffer_size),batch_size,replace=False)
        return [self.buffer[i] for i in index]


class DQN:
    def __init__(self, lr, gamma, max_experiences, batch_size, memory,
                 input_dims=[51,30,4],actions=np.array([0,1])):

        self.memory = memory
        self.batch_size = batch_size
        self.actions = actions
        self.max_experiences = max_experiences
        self.num_actions = len(actions)
        self.input_dims = input_dims
        self.gamma = gamma
        self.lr = lr

        self.model = tf.keras.Sequential([
            InputLayer(input_shape=input_dims),
            Conv2D(64, kernel_size=(7,7), activation='relu'),
            MaxPooling2D(),
            Dropout(0.2),
            Conv2D(96, kernel_size=3, strides=2, activation='relu'),
            MaxPooling2D(),
            Dropout(0.2),
            Conv2D(128, kernel_size=2, strides=1, activation='relu'),
            Dropout(0.2),
            Flatten(),
            Dense(128, activation='linear'),
            Dense(64, activation='linear'),
            Dense(32, activation='linear'),
            Dense(len(actions), activation='linear')])

        self.model.compile(loss = 'mean_squared_error',
                           optimizer=Adam(learning_rate=self.lr),
                           metrics=['accuracy'])


    def act(self, s, episode_actions, actions=np.array([0,1]), prob_cair=70, 
            exploration=0.5, decay=0.999):
        if exploration < 0.01:
            exploration = 0.01
        if np.random.rand() < exploration:
            probs = [actions[1]]*prob_cair + [actions[0]]*(100-prob_cair)
            action = probs[np.random.randint(0,len(probs))]
        else:
            action = np.argmax(self.model.predict(s))
            action = tf.math.reduce_sum(action * tf.one_hot(self.actions, len(self.actions)), axis=1)
            action = np.array(action,dtype=int)[0]
            episode_actions.append(action)
        return action, episode_actions

    def train(self, TargetNet, cb):
        batch = self.memory.sample(self.batch_size)
        states = np.array([each[0] for each in batch], ndmin=3)
        next_states = np.array([each[3] for each in batch], ndmin=3)

        Qs_list = self.model.predict(states)
        Qs2_list = TargetNet.model.predict(next_states)

        X = []
        Y = []

        for index, (s, action , reward, s2, done) in enumerate(batch):

            if not done:
                max_future_q = np.max(Qs2_list[index])
                new_q = reward + self.gamma*max_future_q
            else:
                new_q = reward

            Qs = Qs_list[index]
            Qs[action] = new_q

            X.append(s)
            Y.append(Qs)       

        self.model.fit(x = np.array(X),
                       y = np.array(Y),
                       epochs = 1,
                       callbacks=cb,
                       verbose = 0)

    def copy_weights(self, TrainNet):
        TrainNet.model.set_weights(self.model.get_weights()) 





def main(training=True,
         render=False,
         lr=0.00025,
         gamma=0.99,
         batch_size=96,
         max_size=100000,
         n_stack=4,
         actions=[0,1],
         exploration = 1,
         nepisodes=50,
         npretrain=1000,
         max_steps=50000,
         load_path=''
         ):
    states_size=[51,30,n_stack]
    n_acoes = len(actions)
    ###SET UP DEEP Q NETWORK AND MEMORY
    memory = Memory(100000)
    TrainNet = DQN(lr, gamma,  100, batch_size, memory=memory, input_dims=states_size)
    TargetNet = DQN(lr, gamma, 100, batch_size, memory=memory, input_dims=states_size)

    ### TRAINING
    if training:
        #checkpoint_path+=datetime.now().strftime("%Y%m%d-%H%M%S")

            ### PRE-POPULATING THE MEMORY
        s = env.reset()
        s, stacked_frames = frame_stack(None, s, True, n_stack)
        for i in range(npretrain):
            action = actions[np.random.randint(0,n_acoes)]
            s2, reward, done, _ = env.step(action)
            if reward > 0:
                reward*=10
            else:
                reward /= 5
            s2, stacked_frames = frame_stack(stacked_frames, s2, False, n_stack)
            if done:
                s2=np.zeros(s2.shape)
                memory.add((s,action,reward,s2,done))
                s = env.reset()
                s, stacked_frames = frame_stack(None, s, True, n_stack)
            else:
                memory.add((s, action, reward, s2, done))
                s = s2

        ### START OF TRAINING
        for episode in range(nepisodes):
            if episode == 49 or episode == 50:
                print(TrainNet.model.get_weights())                
            episode_actions = []
            exploration *= np.power(0.01, 2/(nepisodes*1))
            step=0
            episode_rewards=[]
            s=env.reset()
            s,stacked_frames=frame_stack(None,s,True,n_stack)

            while step<max_steps:
                action, episode_actions = TrainNet.act(np.array(s,ndmin=4), episode_actions, exploration= exploration)
                s2, reward, done, _ = env.step(action)
                if reward>0:
                    reward += 5
                if render:
                    env.render(mode='human')
                episode_rewards.append(reward)
                if done:
                    step=max_steps # exit the while loop
                    s2 = np.zeros(s2.shape)
                    s2, stacked_frames = frame_stack(None, s2, True, n_stack)
                    total_reward=sum(episode_rewards)
                    desc = np.count_nonzero(episode_actions)
                    sub = len(episode_actions) - desc
                    print(f'Episode::{episode},Rewards::{(total_reward):.2f},Probability of exploration::{(exploration):.4f}, Subiu {sub} vezes e desceu {desc} vezes')
                    memory.add((s,action,reward,s2,done))
                    TargetNet.copy_weights(TrainNet)
                else:

                    s2, stacked_frames = frame_stack(stacked_frames, s2, False, n_stack)
                    memory.add((s,action,reward,s2,done))
                    s = s2
                step+=1

                ### THE ACTUAL LEARNING STEP
                TrainNet.train(TargetNet, None)



            if episode%10==0:
                TrainNet.model.save('treino/modelo.h5')
            if episode%100==0 and episode!=0:
                os.mkdir(f'treino/{episode}/')
                TrainNet.model.save(f'treino/{episode}/modelo.h5')
    else:
        TrainNet.model = tf.keras.models.load_model('treino/modelo.h5')
        for episode in range(10):
            s = env.reset()
            s, stacked_frames = frame_stack(None, s, True, n_stack)
            episode_rewards = []
            episode_actions = []
            igual = '='

            print("****************************************************")
            print("EPISODE ", episode)

            done = False
            while not done:

                action = np.argmax(TrainNet.model.predict(np.array(s,ndmin=4)))
                print(s, action)
                episode_actions.append(action)
                s2, reward, done, _ = env.step(action)

                episode_rewards.append(reward)
                env.render(mode='human')
                descidas = np.count_nonzero(episode_actions)*100//len(episode_actions)  ## count the model's actions and print them
                subidas = (len(episode_actions) - descidas)*100//len(episode_actions)   ## as a bar chart

                s2, stack_frames = frame_stack(stacked_frames, s2, False, n_stack)
                s = s2

            print(f'Score     || {sum(episode_rewards)}')
            print(f'Subidas   || {igual*subidas}')
            print(f'Descidas  || {igual*descidas}')




if __name__ == "__main__":
    main(False,False)
    env.close()

However, when I set main to main(False, False) after a training session, the agent only goes up/down for every state and always dies. But in the training logs I can clearly see that it often gets through one or two pipes.

For this code I used the OpenAI Gym environment for Flappy Bird. For it to run, the package "gym_ple" needs to be installed on the machine. This gym environment gives the agent the game frame as its state. The agent has two moves: 0 to fly or 1 to do nothing. I am still refining this script, which is why it might look rough.
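For reference, here is a minimal sketch of how this environment is driven, assuming gym_ple and the classic 4-tuple gym step API used in the script above (the random policy is only for illustration):

import gym_ple   # registers FlappyBird-v0 with gym
import gym
import numpy as np

env = gym.make('FlappyBird-v0')   # the state is the raw RGB game frame
s = env.reset()
done = False
while not done:
    action = np.random.randint(0, 2)         # 0 = fly, 1 = do nothing
    s, reward, done, _ = env.step(action)    # (observation, reward, done, info)
    env.render(mode='human')
env.close()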

Training Rewards

Testing Rewards

At the very end of your main() function, you have a typo in the line s2, stack_frames = frame_stack(stacked_frames, s2, False, n_stack). You have stack_frames instead of stacked_frames, which results in your model never getting the updated stacked_frames.
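A minimal sketch of the corrected end of the test loop (the question's code with only the variable name fixed):

                s2, stacked_frames = frame_stack(stacked_frames, s2, False, n_stack)  # was: stack_frames
                s = s2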
