[英]Why is the loss nan in A2C?
我正在實施A2C model,其中 actor 損失正在變為nan
,而 model 沒有學習。 state 空間為5
,動作空間為51
。 有時,state 值在幾步后變得相同,有時變為zero
。 獎勵是一個浮動。 我已經嘗試更改超參數、優化器、激活 function、正則化和損失 function 但問題仍然存在。 示例 state 和損失變為 nan 的操作:
state=[-1.05, -0.99789203, 0, 7.21392952, 0] action=23 loss=nan reward=2.193713495633609。
import argparse
import os
from datetime import datetime
import keras
import tensorflow_probability as tfp
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import LeakyReLU
# Enable the memory growth option for the GPU
tf.config.experimental.set_memory_growth(tf.config.experimental.list_physical_devices('GPU')[0], True)
import var
from tensorflow.keras.layers import LeakyReLU
class Actor(tf.keras.Model):
def __init__(self,action_dim):
super().__init__()
self.d1 = tf.keras.layers.Dense(var.n_hidden1, activation='tanh', kernel_initializer=var.init)
self.bn1 = tf.keras.layers.BatchNormalization()
self.d2 = tf.keras.layers.Dense(var.n_hidden2, activation='tanh', kernel_initializer=var.init)
self.bn2 = tf.keras.layers.BatchNormalization()
self.d3 = tf.keras.layers.Dense(var.n_hidden3, activation='tanh', kernel_initializer=var.init)
self.bn3 = tf.keras.layers.BatchNormalization()
self.a = tf.keras.layers.Dense(action_dim, activation='sigmoid')
def call(self, input_data):
x = self.d1(input_data)
x = self.bn1(x)
x = self.d2(x)
x = self.bn2(x)
x = self.d3(x)
x = self.bn3(x)
a = self.a(x)
return a
class Critic(tf.keras.Model):
def __init__(self):
super().__init__()
self.d1 = tf.keras.layers.Dense(var.n_hidden1, activation='tanh', kernel_initializer=var.init)
self.bn1 = tf.keras.layers.BatchNormalization()
self.d2 = tf.keras.layers.Dense(var.n_hidden2, activation='tanh', kernel_initializer=var.init)
self.bn2 = tf.keras.layers.BatchNormalization()
self.d3 = tf.keras.layers.Dense(var.n_hidden3, activation='tanh', kernel_initializer=var.init)
self.bn3 = tf.keras.layers.BatchNormalization()
self.v = tf.keras.layers.Dense(1, activation=None)
def call(self, input_data):
x = self.d1(input_data)
x = self.bn1(x)
x = self.d2(x)
x = self.bn2(x)
x = self.d3(x)
x = self.bn3(x)
v = self.v(x)
return v
class Agent:
def __init__(self, state_dim, action_dim, action_bound, std_bound):
with tf.device("/device:XLA_GPU:0"):
self.state_dim = state_dim
self.action_dim = action_dim
self.action_bound = action_bound
self.std_bound = std_bound
self.gamma = var.gamma
self.a_opt = tf.optimizers.Adam(learning_rate=var.learning_rate)
self.c_opt = tf.optimizers.Adam(learning_rate=var.learning_rate_critic)
self.actor = Actor(self.action_dim)
self.critic = Critic()
def get_action(self, states):
probs = self.actor((np.array([states])), training=True)
action_probabilities = tf.nn.softmax(probs)
dist = tfp.distributions.Categorical(probs=action_probabilities, dtype=tf.float32)
actions = dist.sample()
return actions
def actor_loss(self, probs, actions, td):
dist = tfp.distributions.Categorical(probs=probs, dtype=tf.float32)
log_prob = dist.log_prob(actions)
loss = -log_prob* td
return loss
def update(self, states, actions, rewards, next_states):
with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
probs = self.actor(states, training=True)
values = self.critic(states, training=True)
next_values = self.critic(next_states, training=True)
td = rewards + self.gamma * next_values - values
a_loss = self.actor_loss(probs, actions, td)
c_loss = tf.keras.losses.huber(td, values)
grads1 = tape1.gradient(a_loss, self.actor.trainable_variables)
grads2 = tape2.gradient(c_loss, self.critic.trainable_variables)
self.a_opt.apply_gradients(zip(grads1, self.actor.trainable_variables))
self.c_opt.apply_gradients(zip(grads2, self.critic.trainable_variables))
return a_loss + c_loss
def save(self, path):
checkpoint =tf.train.Checkpoint(critic_optimizer=self.a_opt, actor_optimizer=self.c_opt, critic=self.critic, actor=self.actor)
status = checkpoint.save(file_prefix=os.path.join(path, "ckpt"))
def restore(self, path):
checkpoint =tf.train.Checkpoint(critic_optimizer=self.a_opt, actor_optimizer=self.c_opt, critic=self.critic, actor=self.actor)
status = checkpoint.restore(tf.train.latest_checkpoint(path)).expect_partial()
if status is None:
raise ValueError('No checkpoint found')
我該如何解決這個問題?
這里絕對是狂野的價值觀。 這個想法可能非常無用,但我唯一一次看到這種結果是當我沒有縮放/規范化數據時,或者當我在一個(或多個)數據值(如 nan 或相對於該列/特征中的其他值的非常極端的值)。 只是在這里試一試——
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.