[英]Fail to implement layer normalization with keras
我试图在keras的完全连接的神经网络中实现层规范化 。 我遇到的问题是所有的损失都是NaN
而且它没有学习。 这是我的代码:
class DenseLN(Layer):
def __init__(self, output_dim, init='glorot_uniform', activation='linear', weights=None,
W_regularizer=None, b_regularizer=None, activity_regularizer=None,
W_constraint=None, b_constraint=None, bias=True, input_dim=None, **kwargs):
self.init = initializations.get(init)
self.activation = activations.get(activation)
self.output_dim = output_dim
self.input_dim = input_dim
self.epsilon = 1e-5
self.W_regularizer = regularizers.get(W_regularizer)
self.b_regularizer = regularizers.get(b_regularizer)
self.activity_regularizer = regularizers.get(activity_regularizer)
self.W_constraint = constraints.get(W_constraint)
self.b_constraint = constraints.get(b_constraint)
self.bias = bias
self.initial_weights = weights
self.input_spec = [InputSpec(ndim=2)]
if self.input_dim:
kwargs['input_shape'] = (self.input_dim,)
super(DenseLN, self).__init__(**kwargs)
def ln(self, x):
# layer normalization function
m = K.mean(x, axis=0)
std = K.sqrt(K.var(x, axis=0) + self.epsilon)
x_normed = (x - m) / (std + self.epsilon)
x_normed = self.gamma * x_normed + self.beta
return x_normed
def build(self, input_shape):
assert len(input_shape) == 2
input_dim = input_shape[1]
self.input_spec = [InputSpec(dtype=K.floatx(),
shape=(None, input_dim))]
self.gamma = K.variable(np.ones(self.output_dim) * 0.2, name='{}_gamma'.format(self.name))
self.beta = K.zeros((self.output_dim,), name='{}_beta'.format(self.name))
self.W = self.init((input_dim, self.output_dim),
name='{}_W'.format(self.name))
if self.bias:
self.b = K.zeros((self.output_dim,),
name='{}_b'.format(self.name))
self.trainable_weights = [self.W, self.gamma, self.beta, self.b]
else:
self.trainable_weights = [self.W, self.gamma, self.beta]
self.regularizers = []
if self.W_regularizer:
self.W_regularizer.set_param(self.W)
self.regularizers.append(self.W_regularizer)
if self.bias and self.b_regularizer:
self.b_regularizer.set_param(self.b)
self.regularizers.append(self.b_regularizer)
if self.activity_regularizer:
self.activity_regularizer.set_layer(self)
self.regularizers.append(self.activity_regularizer)
self.constraints = {}
if self.W_constraint:
self.constraints[self.W] = self.W_constraint
if self.bias and self.b_constraint:
self.constraints[self.b] = self.b_constraint
if self.initial_weights is not None:
self.set_weights(self.initial_weights)
del self.initial_weights
def call(self, x, mask=None):
output = K.dot(x, self.W)
output = self.ln(output)
#print (theano.tensor.shape(output))
if self.bias:
output += self.b
return self.activation(output)
def get_output_shape_for(self, input_shape):
assert input_shape and len(input_shape) == 2
return (input_shape[0], self.output_dim)
model = Sequential()
model.add(Dense(12, activation='sigmoid', input_dim=12))
model.add(DenseLN(98, activation='sigmoid'))
model.add(DenseLN(108, activation='sigmoid'))
model.add(DenseLN(1))
adadelta = Adadelta(lr=0.1, rho=0.95, epsilon=1e-08)
adagrad = Adagrad(lr=0.003, epsilon=1e-08)
model.compile(loss='poisson',
optimizer=adagrad,
metrics=['accuracy'])
model.fit(X_train_scale,
Y_train,
batch_size=3000,
callbacks=[history],
nb_epoch=300)
你知道这里有什么问题吗?我该如何解决? 提前致谢!
编辑:
我也试过了一些层的组合,发现了一些需要的东西。 如果输入和输出层都是正常Dense
层,则精度将非常低,几乎为零。 但是如果输入层是DenseLN
,即我的自定义层,则首先精度为0.6+
,经过数十次迭代后,它再次0.6+
为零。 实际上我从Dense
层复制了大部分代码,所有不同之处在于call
函数中的ln
函数和self.ln(output)
。 此外,我还在trainable_weights
添加了gamma
和beta
。
任何帮助表示赞赏!
如果将其作为单独的层实现,它会更清晰,更灵活。 这样的事情应该有效:
class LayerNorm(Layer):
""" Layer Normalization in the style of https://arxiv.org/abs/1607.06450 """
def __init__(self, scale_initializer='ones', bias_initializer='zeros', **kwargs):
super(LayerNorm, self).__init__(**kwargs)
self.epsilon = 1e-6
self.scale_initializer = initializers.get(scale_initializer)
self.bias_initializer = initializers.get(bias_initializer)
def build(self, input_shape):
self.scale = self.add_weight(shape=(input_shape[-1],),
initializer=self.scale_initializer,
trainable=True,
name='{}_scale'.format(self.name))
self.bias = self.add_weight(shape=(input_shape[-1],),
initializer=self.bias_initializer,
trainable=True,
name='{}_bias'.format(self.name))
self.built = True
def call(self, x, mask=None):
mean = K.mean(x, axis=-1, keepdims=True)
std = K.std(x, axis=-1, keepdims=True)
norm = (x - mean) * (1/(std + self.epsilon))
return norm * self.scale + self.bias
def compute_output_shape(self, input_shape):
return input_shape
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.