
My neural network is approximating X^2 with a straight line

I am currently trying to implement my own neural network from scratch to test my understanding of the method. I thought things were going well, since my network managed to approximate the AND and XOR functions without issue, but it turns out it is having trouble learning to approximate a simple square function.

I have tried a variety of network configurations, from 1 to 3 layers and 1-64 nodes. I have varied the learning rate from 0.1 down to 0.00000001 and implemented weight decay, since I thought some regularisation might offer insight into what went wrong. I have also implemented gradient checking, which gives me conflicting answers, as the difference varies wildly from attempt to attempt, ranging from an awful 0.6 to a fantastic 1e-10. I am using the leaky ReLU activation function and MSE as my cost function.

Can anyone help me spot what I am missing? Or is this purely a matter of tuning the hyper-parameters?

My code is as follows:

import matplotlib.pyplot as plt
import numpy as np
import Sub_Script as ss

# Create sample data set using X**2

X = np.expand_dims(np.linspace(0, 1, 201), axis=0)
y = X**2

plt.plot(X.T, y.T)


# Hyper-parameters

layer_dims = [1, 64, 1]
learning_rate = 0.000001
iterations = 50000
decay = 0.00000001
num_ex = y.shape[1]


# Initializations

num_layers = len(layer_dims)
weights = [None] + [np.random.randn(layer_dims[l], layer_dims[l-1])*np.sqrt(2/layer_dims[l-1]) for l in range(1, num_layers)]
biases = [None] + [np.zeros((layer_dims[l], 1)) for l in range(1, num_layers)]

dweights, dbiases, dw_approx, db_approx = ss.grad_check(weights, biases, num_layers, X, y, decay, num_ex)

# Main function: Iteration loop

for iter in range(iterations):
    # Main function: Forward Propagation
    z_values, acts = ss.forward_propagation(weights, biases, num_layers, X)
    dweights, dbiases = ss.backward_propagation(weights, biases, num_layers, z_values, acts, y)
    weights, biases = ss.update_paras(weights, biases, dweights, dbiases, learning_rate, decay, num_ex)

    if iter % (1000+1) == 0:
        print('Cost: ', ss.mse(acts[-1], y, weights, decay, num_ex))


# Gradient Checking

dweights, dbiases, dw_approx, db_approx = ss.grad_check(weights, biases, num_layers, X, y, decay, num_ex)


# Visualization

plt.plot(X.T, acts[-1].T)

With Sub_Script.py containing the neural network functions:

import numpy as np
import copy as cp

# Construct sub functions, forward, backward propagation and cost and activation functions
# Leaky ReLU Activation Function

def relu(x):
    return (x > 0) * x + (x < 0) * 0.01*x


# Leaky ReLU activation Function Gradient

def relu_grad(x):
    return (x > 0) + (x < 0) * 0.01


# MSE Cost Function

def mse(prediction, actual, weights, decay, num_ex):
    return np.sum((actual - prediction) ** 2)/(actual.shape[1]) + (decay/(2*num_ex))*np.sum([np.sum(w) for w in weights[1:]])


# MSE Cost Function Gradient

def mse_grad(prediction, actual):
    return -2 * (actual - prediction)/(actual.shape[1])


# Forward Propagation

def forward_propagation(weights, biases, num_layers, act):
    acts = [[None] for i in range(num_layers)]
    z_values = [[None] for i in range(num_layers)]
    acts[0] = act

    for layer in range(1, num_layers):
        z_values[layer] = np.dot(weights[layer], acts[layer-1]) + biases[layer]
        acts[layer] = relu(z_values[layer])
    return z_values, acts


# Backward Propagation

def backward_propagation(weights, biases, num_layers, z_values, acts, y):
    dweights = [[None] for i in range(num_layers)]
    dbiases = [[None] for i in range(num_layers)]
    zgrad = mse_grad(acts[-1], y) * relu_grad(z_values[-1])
    dweights[-1] = np.dot(zgrad, acts[-2].T)
    dbiases[-1] = np.sum(zgrad, axis=1, keepdims=True)

    for layer in range(num_layers-2, 0, -1):
        zgrad = np.dot(weights[layer+1].T, zgrad) * relu_grad(z_values[layer])
        dweights[layer] = np.dot(zgrad, acts[layer-1].T)
        dbiases[layer] = np.sum(zgrad, axis=1, keepdims=True)

    return dweights, dbiases


# Update Parameters with Regularization

def update_paras(weights, biases, dweights, dbiases, learning_rate, decay, num_ex):
    weights = [None] + [w - learning_rate*(dw + (decay/num_ex)*w) for w, dw in zip(weights[1:], dweights[1:])]
    biases = [None] + [b - learning_rate*db for b, db in zip(biases[1:], dbiases[1:])]

    return weights, biases


# Gradient Checking

def grad_check(weights, biases, num_layers, X, y, decay, num_ex):
    z_values, acts = forward_propagation(weights, biases, num_layers, X)
    dweights, dbiases = backward_propagation(weights, biases, num_layers, z_values, acts, y)
    epsilon = 1e-7
    dw_approx = cp.deepcopy(weights)
    db_approx = cp.deepcopy(biases)
    for layer in range(1, num_layers):
        height = weights[layer].shape[0]
        width = weights[layer].shape[1]
        for i in range(height):
            for j in range(width):
                w_plus = cp.deepcopy(weights)
                w_plus[layer][i, j] += epsilon
                w_minus = cp.deepcopy(weights)
                w_minus[layer][i, j] -= epsilon
                _, temp_plus = forward_propagation(w_plus, biases, num_layers, X)
                cost_plus = mse(temp_plus[-1], y, w_plus, decay, num_ex)
                _, temp_minus = forward_propagation(w_minus, biases, num_layers, X)
                cost_minus = mse(temp_minus[-1], y, w_minus, decay, num_ex)
                dw_approx[layer][i, j] = (cost_plus - cost_minus)/(2*epsilon)
            b_plus = cp.deepcopy(biases)
            b_plus[layer][i, 0] += epsilon
            b_minus = cp.deepcopy(biases)
            b_minus[layer][i, 0] -= epsilon
            _, temp_plus = forward_propagation(weights, b_plus, num_layers, X)
            cost_plus = mse(temp_plus[-1], y, weights, decay, num_ex)
            _, temp_minus = forward_propagation(weights, b_minus, num_layers, X)
            cost_minus = mse(temp_minus[-1], y, weights,  decay, num_ex)
            db_approx[layer][i, 0] = (cost_plus - cost_minus)/(2*epsilon)
    dweights_flat = [dw.flatten() for dw in dweights[1:]]
    dweights_flat = np.concatenate(dweights_flat, axis=None)
    dw_approx_flat = [dw.flatten() for dw in dw_approx[1:]]
    dw_approx_flat = np.concatenate(dw_approx_flat, axis=None)
    dbiases_flat = [db.flatten() for db in dbiases[1:]]
    dbiases_flat = np.concatenate(dbiases_flat, axis=None)
    db_approx_flat = [db.flatten() for db in db_approx[1:]]
    db_approx_flat = np.concatenate(db_approx_flat, axis=None)
    d_paras = np.concatenate([dweights_flat, dbiases_flat], axis=None)
    d_approx_paras = np.concatenate([dw_approx_flat, db_approx_flat], axis=None)
    difference = np.linalg.norm(d_paras - d_approx_paras)/(np.linalg.norm(d_paras) + np.linalg.norm(d_approx_paras))

    if difference > 2e-7:
        print(
            "\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print(
            "\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return dweights, dbiases, dw_approx, db_approx

EDIT: Corrected a few of the old comments in my code to avoid confusion.

EDIT 2: Thanks to @sid_508 for helping me find the main problem with my code! I also want to mention in this edit that I found some errors in the way I had implemented weight decay. After making the suggested change and removing the weight decay element entirely for now, the neural network seems to work!
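(The edit does not spell out the exact weight-decay fix, but for reference, a conventional L2 penalty sums the squared weights rather than the raw weights as the mse above does. A minimal sketch, keeping the question's function signature and the decay/(2*num_ex) scaling already used in update_paras, would be:)

import numpy as np

# Sketch only: same signature as the question's mse, with a standard L2 term
def mse(prediction, actual, weights, decay, num_ex):
    # Data term: mean squared error over the examples
    data_cost = np.sum((actual - prediction) ** 2) / actual.shape[1]
    # L2 penalty uses the squared weights, not their raw sum
    l2_cost = (decay / (2 * num_ex)) * np.sum([np.sum(w ** 2) for w in weights[1:]])
    return data_cost + l2_cost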

I ran your code and this is the output it gives:

[Plot: final-layer activations]

The problem is that you are using ReLU for the final layer as well, so you cannot get the best fit. Use no activation in the final layer and it should produce far better results.

The final-layer activation is usually different from the one used in the hidden layers, and it depends on the type of output you are after: use a linear activation (basically no activation) for continuous outputs, and sigmoid/softmax for classification.
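For illustration, a minimal sketch of that change based on the question's forward_propagation (not the answerer's exact code), with the hidden layers kept as leaky ReLU and the output layer left linear:

import numpy as np

def relu(x):
    # leaky ReLU, as in the question's Sub_Script.py
    return (x > 0) * x + (x < 0) * 0.01 * x

def forward_propagation(weights, biases, num_layers, act):
    acts = [None for _ in range(num_layers)]
    z_values = [None for _ in range(num_layers)]
    acts[0] = act
    for layer in range(1, num_layers):
        z_values[layer] = np.dot(weights[layer], acts[layer - 1]) + biases[layer]
        if layer == num_layers - 1:
            acts[layer] = z_values[layer]        # linear output layer
        else:
            acts[layer] = relu(z_values[layer])  # leaky ReLU in the hidden layers
    return z_values, acts

The matching change in backward_propagation would be to drop the relu_grad(z_values[-1]) factor from the output-layer gradient, since the derivative of the identity activation is 1.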
