Problem with implementing temporal-difference-based actor-critic
I implemented a simple actor-critic model in Tensorflow==2.3.1 to learn the CartPole environment, but it is not learning at all. The average score over every 50 episodes is below 20. Can someone please point out why the model isn't learning?
I based my algorithm on the following pseudocode:
for every episode do
    S <- starting state
    for every step do
        choose action A based on actor policy P
        take action A
        observe the reward R and the new state S_new
        calculate the error E based on the TD(0) schema for V:
        if S_new is not terminal then
            E <- R + discount_factor * V(S_new) - V(S)
        else
            E <- R - V(S)
        end if
        calculate the loss for the critic: Lc <- E^2
        calculate the loss for the actor: La <- -E * ln(P(A, S))
        calculate the overall loss: L <- Lc + La
        update the weights by descending the loss gradient: w <- w - alpha * grad_w(L)
    end for
end for
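As a concrete example of the error term: for a non-terminal step with R = 1 (CartPole's per-step reward), discount_factor = 0.99, V(S) = 10.0 and V(S_new) = 10.5, we get E = 1 + 0.99 * 10.5 - 10.0 = 1.395. The error is positive, so the critic's estimate of S is pulled up and the action just taken is made more likely.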
Discount factor and alpha are constant. Here are the requirements:
box2d==2.3.10
gym==0.17.3
keras==2.4.3
matplotlib==3.3.3
numpy==1.19.4
scikit-learn==0.23.2
tensorflow==2.3.1
tensorflow-probability==0.11.1
tqdm==4.53.0
And finally, my code is:
from typing import Optional
import math
import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tqdm import tqdm
CART_POLE_ACTIONS = [0, 1]
CART_POLE_OUTPUT_LEN = len(CART_POLE_ACTIONS)
CART_POLE_INPUTS = 4
class ActorCriticController:
    def __init__(self, environment, learning_rate: float, discount_factor: float, input_size: int,
                 output_size: int, h1_size: int, h2_size: int, actions: list):
        self.environment = environment
        self.discount_factor: float = discount_factor
        self.input_size = input_size
        self.output_size = output_size
        self.h1_size = h1_size
        self.h2_size = h2_size
        self.actions = actions
        self.optimizer: tf.keras.optimizers.Adam = tf.keras.optimizers.Adam(
            learning_rate=learning_rate)
        self.last_error_squared: float = 0.0
        self.model: tf.keras.Model = self.create_actor_critic_model()
        self.log_action_probability: Optional[tf.Tensor] = None
        self.tape: Optional[tf.GradientTape] = None

    def create_actor_critic_model(self) -> tf.keras.Model:
        inputs = tf.keras.Input(shape=(self.input_size,))
        hidden1 = tf.keras.layers.Dense(self.h1_size, activation='relu')(inputs)
        hidden2 = tf.keras.layers.Dense(self.h2_size, activation='relu')(hidden1)
        outputs_actor = tf.keras.layers.Dense(self.output_size, activation='softmax')(hidden2)
        outputs_critic = tf.keras.layers.Dense(1, activation='linear')(hidden2)
        model = tf.keras.Model(inputs=inputs, outputs=[outputs_actor, outputs_critic])
        return model

    def choose_action(self, state: np.ndarray) -> int:
        state = self.format_state(state)
        self.tape = tf.GradientTape()
        with self.tape:
            probs, _ = self.model(state)
            action = tfp.distributions.Categorical(probs=probs).sample(1)
            index = self.actions.index(int(action))
            self.log_action_probability = math.log(probs[0][index], math.e)
        return int(action)

    def learn(self, state: np.ndarray, reward: float, new_state: np.ndarray, terminal: bool):
        state = self.format_state(state)
        new_state = self.format_state(new_state)
        with self.tape:
            _, critic_value = self.model(state)
            _, new_critic_value = self.model(new_state)
            error = reward - critic_value
            if not terminal:
                error += self.discount_factor * new_critic_value
            self.last_error_squared = float(error) ** 2
            loss = self.last_error_squared - error * self.log_action_probability
        gradients = self.tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_weights))

    @staticmethod
    def format_state(state: np.ndarray) -> np.ndarray:
        return np.reshape(state, (1, state.size))
def main() -> None:
    environment = gym.make('CartPole-v1')
    controller = ActorCriticController(environment=environment,
                                       learning_rate=0.00001,
                                       discount_factor=0.99,
                                       input_size=CART_POLE_INPUTS,  # 4
                                       output_size=CART_POLE_OUTPUT_LEN,  # 2
                                       h1_size=128,  # 1024, 128
                                       h2_size=32,  # 256, 32
                                       actions=CART_POLE_ACTIONS)

    past_rewards = []
    past_errors = []
    for i_episode in tqdm(range(2000)):
        done = False
        state = environment.reset()
        reward_sum = 0.0
        errors_history = []

        while not done:
            environment.render()
            action = controller.choose_action(state)
            new_state, reward, done, info = environment.step(action)
            controller.learn(state, reward, new_state, done)
            state = new_state
            reward_sum += reward
            errors_history.append(controller.last_error_squared)

        print(f"reward_sum = {reward_sum}\n\n\n")
        past_rewards.append(reward_sum)
        past_errors.append(np.mean(errors_history))

        window_size = 50
        if i_episode % 25 == 0:
            if len(past_rewards) >= window_size:
                fig, axs = plt.subplots(2)
                axs[0].plot(
                    [np.mean(past_errors[i:i + window_size]) for i in range(len(past_errors) - window_size)],
                    'tab:red',
                )
                axs[0].set_title('mean squared error')
                axs[1].plot(
                    [np.mean(past_rewards[i:i + window_size]) for i in range(len(past_rewards) - window_size)],
                    'tab:green',
                )
                axs[1].set_title('sum of rewards')
                plt.savefig(f'learning_{i_episode}.png')
                plt.clf()

    environment.close()
    controller.model.save("final.model")


if __name__ == '__main__':
    main()
Thank you in advance for your help. I hope someone will advise me :)
I was able to fix your code. Main changes:
- replaced math.log() with tfp.distributions.Categorical.log_prob()
- raised the learning rate to 0.00006 and enlarged the hidden layers (1024 and 256 units)
But I'm not entirely sure why it works this way, so further clarification is appreciated.
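My best guess at the why: math.log() calls float() on its argument, so probs[0][index] leaves the TensorFlow graph and becomes a plain Python float. The GradientTape therefore records no connection between the actor's softmax output and log_action_probability, and the actor head never receives a policy gradient (the critic's squared error, also stored as a Python float, was similarly invisible to the tape). Categorical.log_prob() keeps the log-probability as a tf.Tensor inside the taped computation, so gradients can flow. A minimal standalone demonstration (not part of the training script):

import math
import tensorflow as tf

x = tf.Variable(2.0)

with tf.GradientTape() as tape:
    y = math.log(x)          # float(x) is called under the hood: y is a plain Python float
print(type(y))               # <class 'float'> -- the tape recorded nothing

with tf.GradientTape() as tape:
    y = tf.math.log(x)       # y stays a tf.Tensor and the op is recorded
print(tape.gradient(y, x))   # tf.Tensor(0.5, ...): d ln(x)/dx = 1/x = 0.5 at x = 2

And here is the full fixed code: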
from typing import Optional

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tqdm import tqdm


class ActorCriticController:
    def __init__(self, environment, learning_rate: float, discount_factor: float, input_size: int,
                 output_size: int, h1_size: int, h2_size: int) -> None:
        self.environment = environment
        self.discount_factor: float = discount_factor
        self.input_size = input_size
        self.output_size = output_size
        self.h1_size = h1_size
        self.h2_size = h2_size
        self.model: tf.keras.Model = self.create_actor_critic_model()
        self.optimizer: tf.keras.optimizers.Adam = tf.keras.optimizers.Adam(lr=learning_rate)
        self.log_action_probability: Optional[tf.Tensor] = None
        self.tape: Optional[tf.GradientTape] = None
        self.last_error_squared: float = 0.0

    def create_actor_critic_model(self) -> tf.keras.Model:
        inputs = tf.keras.Input(shape=(self.input_size,))
        hidden1 = tf.keras.layers.Dense(self.h1_size, activation='relu')(inputs)
        hidden2 = tf.keras.layers.Dense(self.h2_size, activation='relu')(hidden1)
        outputs_actor = tf.keras.layers.Dense(self.output_size, activation='softmax')(hidden2)
        outputs_critic = tf.keras.layers.Dense(1, activation='linear')(hidden2)
        model = tf.keras.Model(inputs=inputs, outputs=[outputs_actor, outputs_critic])
        return model

    def choose_action(self, state: np.ndarray) -> int:
        state = self.format_state(state)
        self.tape = tf.GradientTape()
        with self.tape:
            probs, _ = self.model(state)
            distribution = tfp.distributions.Categorical(probs=probs)
            action = distribution.sample()
            # log_prob keeps the log-probability as a tensor on the tape,
            # so the actor head can receive a gradient later in learn()
            self.log_action_probability = distribution.log_prob(action)
        return int(action)

    # noinspection PyTypeChecker
    def learn(self, state: np.ndarray, reward: float, new_state: np.ndarray, terminal: bool):
        state = self.format_state(state)
        new_state = self.format_state(new_state)
        with self.tape:
            if not terminal:
                error = reward - self.model(state)[1] + self.discount_factor * \
                        self.model(new_state)[1]
            else:
                error = reward - self.model(state)[1]
            self.last_error_squared = float(error) ** 2
            actor_loss = -error * self.log_action_probability
            critic_loss = error ** 2
            loss = actor_loss + critic_loss
        gradients = self.tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_weights))

    @staticmethod
    def format_state(state: np.ndarray) -> np.ndarray:
        return np.reshape(state, (1, state.size))


def main() -> None:
    environment = gym.make('CartPole-v1')
    controller = ActorCriticController(environment=environment,
                                       learning_rate=0.00006,
                                       discount_factor=0.99,
                                       input_size=4,
                                       output_size=2,
                                       h1_size=1024,  # 1024, 128
                                       h2_size=256)  # 256, 32

    past_rewards = []
    past_errors = []
    for i_episode in tqdm(range(2000)):
        done = False
        state = environment.reset()
        reward_sum = 0.0
        errors_history = []

        while not done:
            environment.render()
            action = controller.choose_action(state)
            new_state, reward, done, info = environment.step(action)
            controller.learn(state, reward, new_state, done)
            state = new_state
            reward_sum += reward
            errors_history.append(controller.last_error_squared)

        print(f"reward_sum = {reward_sum}\n\n\n")
        past_rewards.append(reward_sum)
        past_errors.append(np.mean(errors_history))

        if i_episode % 150 == 0:
            controller.model.save(f"model.{i_episode}")

        window_size = 30
        if i_episode % 100 == 0:
            if len(past_rewards) >= window_size:
                fig, axs = plt.subplots(2)
                axs[0].plot(
                    [np.mean(past_errors[i:i + window_size]) for i in
                     range(len(past_errors) - window_size)],
                    'tab:red',
                )
                axs[0].set_title('mean squared error')
                axs[1].plot(
                    [np.mean(past_rewards[i:i + window_size]) for i in
                     range(len(past_rewards) - window_size)],
                    'tab:green',
                )
                axs[1].set_title('sum of rewards')
                plt.savefig(f'plots/learning_{i_episode}.png')
                plt.clf()

    environment.close()
    controller.model.save(r"final.model")


if __name__ == '__main__':
    main()
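For reference, Categorical.log_prob() returns exactly the value the original code tried to compute by hand, just without leaving the graph. A quick standalone check (the probabilities here are made-up example values):

import tensorflow as tf
import tensorflow_probability as tfp

probs = tf.constant([[0.3, 0.7]])
distribution = tfp.distributions.Categorical(probs=probs)
print(distribution.log_prob(1))   # ln(0.7) ~= -0.3567, as a differentiable tensor
print(tf.math.log(probs[0][1]))   # same value computed directly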