
Why is my error value increasing for this simple implementation of a neural network?

My error value increases when I subtract the gradient times the learning rate (0.5) from each parameter.

Shouldn't the error value be decreasing when I do so?

Code:

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

params = {}

params["W1"] = np.random.rand(5,20)
params["W2"] = np.random.rand(20,10)
params["b1"] = np.zeros(20)
params["b2"] = np.zeros(10)

# creating training data values
S = np.array([[1,0,0,1,0],
             [1,0,0,0,1],
             [1,0,0,1,1],
             [0,0,1,1,0],
             [0,1,1,0,0],
             [0,1,1,0,0],
             [0,0,0,0,1],
             [0,0,1,0,0],
             [1,0,0,0,1],
             [1,1,1,0,1]])

# creating actual classification values for the training data
simple_array = np.array([0,1,2,3,4,5,6,7,8,9])
t_train = np.zeros((simple_array.size, simple_array.max()+1), dtype=int)
t_train[np.arange(simple_array.size),simple_array] = 1 


def predict(S):
    #find a1
    a1 = np.dot(S, params["W1"]) + params["b1"]
    
    # calculate z1 = relu(a1)
    z1 = []
    for row in a1:
        row0 = []
        for element in row:
            row0.append(max(0.0, element))
        z1.append(row)
    z1 = np.array(z1)
    
    # calculate a2 = b2 + z1*W2
    a2 = np.dot(z1,params["W2"]) + params["b2"]
    
    # calculate z2 = softmax(a2)
    
    z2 = []
    for i in range(len(a2)):
        row = a2[i] - np.max(a2[i])
        summation = np.sum(np.exp(row))
        row = np.exp(row)/summation
        z2.append(row)
    z2 = np.array(z2)
    return z2

def loss(S):
    predictions = predict(S) #10*10 array
    error = -np.sum(t_train*np.log(predictions + 1e-7))
    return error

loss_list = []
loss_list.append(loss(S))

#finding numerical derivative and updating parameters

h = 0.0000001
for m in range(100):
    for i in range(int(params["W1"].shape[0])):
        for j in range(int(params["W1"].shape[1])):
            params["W1"][i][j] += h
            bef = loss(S)
            params["W1"][i][j] -= h*2
            aft = loss(S)
            params["W1"][i][j] += h
            deriv = (aft - bef)/(2*h) 
            params["W1"][i][j] -= 0.5*deriv
    
    
    for i in range(int(params["W2"].shape[0])):
        for j in range(int(params["W2"].shape[1])):
            params["W2"][i][j] += h
            bef = loss(S)
            params["W2"][i][j] -= h*2
            aft = loss(S)
            params["W2"][i][j] += h
            deriv = (aft - bef)/(2*h) 
            params["W2"][i][j] -= 0.5*deriv
    
    for i in range(int(params["b1"].shape[0])):
            params["b1"][i] += h
            bef = loss(S)
            params["b1"][i] -= h*2
            aft = loss(S)
            params["b1"][i] += h
            deriv = (aft - bef)/(2*h) 
            params["b1"][i] -= 0.5*deriv
            
    for i in range(int(params["b2"].shape[0])):
            params["b2"][i] += h
            bef = loss(S)
            params["b2"][i] -= h*2
            aft = loss(S)
            params["b2"][i] += h
            deriv = (aft - bef)/(2*h) 
            params["b2"][i] -= 0.5*deriv

    loss_list.append(loss(S))

plt.plot(np.array(loss_list))

This is the graph I get: [plot of the loss obtained by subtracting the gradient times the learning rate from the parameters]

Also, when I add the gradient times the learning rate to the parameters, I get a decreasing loss function. I am not sure why, as I would expect the opposite to happen.

[Plot of the loss obtained by adding the gradient times the learning rate to the parameters]

The main reason your loss is increasing is that you calculate the opposite of the symmetric difference quotient, so each update step actually moves the parameters in the direction of increasing loss.

In your code, you have:

params["W1"][i][j] += h
bef = loss(S)
params["W1"][i][j] -= h*2
aft = loss(S)
params["W1"][i][j] += h
deriv = (aft - bef)/(2*h) 
params["W1"][i][j] -= 0.5*deriv

If you swap the second and fourth lines (bef = loss(S) and aft = loss(S)), and do the same for the other parameters, the loss starts to decrease.
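
For reference, the symmetric difference quotient approximates the derivative as f'(x) ≈ (f(x + h) - f(x - h)) / (2h). With your original ordering you compute (loss(w - h) - loss(w + h)) / (2h), which is the negative of that, so subtracting 0.5*deriv steps uphill. Below is a minimal standalone sketch of the correct quotient (my own helper, not part of your code):

def central_difference(f, x, h=1e-7):
    # Symmetric (central) difference quotient: f'(x) ~ (f(x + h) - f(x - h)) / (2 * h)
    return (f(x + h) - f(x - h)) / (2 * h)

# Sanity check on f(x) = x**2, whose derivative at x = 3 is 6
print(central_difference(lambda x: x ** 2, 3.0))  # roughly 6.0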

There are a couple of other problems: the ReLU activation is never actually applied (you append row instead of row0 to the new array), and I'm not sure that updating each parameter in place while you are still computing the gradients of the remaining ones is correct. Besides that, the code looks ready for some hyperparameter tuning :)
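
As a side note, the nested ReLU loops can be replaced by a single vectorized call. A minimal sketch, assuming only NumPy (relu here is my own helper name, not something used in the original code):

import numpy as np

def relu(a):
    # Element-wise max(0, a); equivalent to the corrected nested loops
    return np.maximum(0.0, a)

a1 = np.array([[-1.5, 0.0, 2.0],
               [3.0, -0.2, 0.5]])
print(relu(a1))  # negative entries are clamped to 0.0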

Here's my final version:

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

params = {}

params["W1"] = np.random.rand(5, 20)
params["W2"] = np.random.rand(20, 10)
params["b1"] = np.zeros(20)
params["b2"] = np.zeros(10)

# creating training data values
S = np.array(
    [
        [1, 0, 0, 1, 0],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 1, 1],
        [0, 0, 1, 1, 0],
        [0, 1, 1, 0, 0],
        [0, 1, 1, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 0, 1, 0, 0],
        [1, 0, 0, 0, 1],
        [1, 1, 1, 0, 1],
    ]
)

# creating actual classification values for the training data
simple_array = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
t_train = np.zeros((simple_array.size, simple_array.max() + 1), dtype=int)
t_train[np.arange(simple_array.size), simple_array] = 1


def predict(S):
    # find a1
    a1 = np.dot(S, params["W1"]) + params["b1"]

    # calculate z1 = relu(a1)
    z1 = []
    for row in a1:
        row0 = []
        for element in row:
            row0.append(max(0.0, element))
        z1.append(row0)
    z1 = np.array(z1)
    assert np.all(z1 >= 0.0)

    # calculate a2 = b2 + z1*W2
    a2 = np.dot(z1, params["W2"]) + params["b2"]

    # calculate z2 = softmax(a2)

    z2 = []
    for i in range(len(a2)):
        row = a2[i] - np.max(a2[i])
        summation = np.sum(np.exp(row))
        row = np.exp(row) / summation
        z2.append(row)
    z2 = np.array(z2)
    assert np.allclose(np.sum(z2, axis=1), 1.0)
    return z2


def loss(S):
    predictions = predict(S)  # 10*10 array
    error = -np.sum(t_train * np.log(predictions + 1e-7))
    return error


loss_list = []
loss_list.append(loss(S))

# finding numerical derivative and updating parameters

h = 0.0000001
alpha = 0.5
for m in range(100):
    W1 = params["W1"].copy()
    for i in range(int(params["W1"].shape[0])):
        for j in range(int(params["W1"].shape[1])):
            params["W1"][i][j] += h
            aft = loss(S)
            params["W1"][i][j] -= h * 2
            bef = loss(S)
            params["W1"][i][j] += h
            deriv = (aft - bef) / (2 * h)
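            # store the gradient-descent update in the copy; params["W1"] is only replaced after the full sweep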
            W1[i][j] = params["W1"][i][j] - alpha * deriv
    params["W1"] = W1

    W2 = params["W2"].copy()
    for i in range(int(params["W2"].shape[0])):
        for j in range(int(params["W2"].shape[1])):
            params["W2"][i][j] += h
            aft = loss(S)
            params["W2"][i][j] -= h * 2
            bef = loss(S)
            params["W2"][i][j] += h
            deriv = (aft - bef) / (2 * h)
            W2[i][j] = params["W2"][i][j] - alpha * deriv
    params["W2"] = W2

    b1 = params["b1"].copy()
    for i in range(int(params["b1"].shape[0])):
        params["b1"][i] += h
        aft = loss(S)
        params["b1"][i] -= h * 2
        bef = loss(S)
        params["b1"][i] += h
        deriv = (aft - bef) / (2 * h)
        b1[i] = params["b1"][i] - alpha * deriv
    params["b1"] = b1

    b2 = params["b2"].copy()
    for i in range(int(params["b2"].shape[0])):
        params["b2"][i] += h
        aft = loss(S)
        params["b2"][i] -= h * 2
        bef = loss(S)
        params["b2"][i] += h
        deriv = (aft - bef) / (2 * h)
        b2[i] = params["b2"][i] - alpha * deriv
    params["b2"] = b2

    loss_epoch = loss(S)
    print(m, loss_epoch)
    loss_list.append(loss_epoch)

plt.plot(np.array(loss_list))
