I am building a basic 3-layer neural network in Python. After writing a gradient function, I ran gradient checking on it against a numerical gradient. The relative difference came out large, so I unrolled both gradients for the weight matrices and compared them side by side.
Function Gradient Numerical Gradient
-0.000968788380809 0.0
0.0153540197907 0.0153540197889
-0.00584391679274 -0.00584391679048
-0.00490359558077 -0.00490359558514
-0.00171892592537 -0.0017189259216
0.00913024106334 0.00913024106319
-0.0182154767069 -0.0182154767092
0.0152611324409 0.01526113244
-0.00373505297372 -0.00373505297135
-0.00513225994728 -0.00513225994814
-0.00531954399401 -0.00531954399641
-0.0185748801227 -0.0185748801163
0.00745186105851 0.00745186105267
0.0134566626927 0.0134566626908
0.0251548691426 0.0251548691388
0.00609388350562 0.00609388350226
-0.00471176815719 -0.00471176815564
0.0113580721225 0.0113580721228
0.00465172663488 0.00465172663944
-0.0221326283708 -0.02213262837
0.300007655583 -0.300007655583 <- diverges from here on; this is the start of the theta2 gradient
0.155638694282 -0.15345321819
0.147747817305 -0.149026829224
0.150703152382 -0.172330417252
0.156307235611 -0.116975643856
0.136898763375 -0.170081036297
0.0621121242042 -0.0621121242372
0.0442762464937 -0.0187338352431
0.0489123689979 -0.00938236375481
0.0244392582651 -0.0465061209964
0.0237741996575 -0.028319115235
0.0313594790974 -0.0330473942922
0.106306327946 -0.106306327941
0.0348751481828 -0.0704775747806
0.0303373211657 -0.0756744476749
0.0633094699759 -0.0461971224763
0.0524239030728 -0.0477244101571
0.0633274024777 -0.0397657392082
Relative Difference:
6.61473694017
The first 20 elements in each list correspond to the gradient for the first weight matrix, and the remaining 18 to the gradient for the second. From what I can see, the error occurs in the last 18 elements (i.e. the theta2 gradient), where the function gradient and the "correct" numerical gradient disagree, most visibly in sign. This also causes scipy.optimize.fmin_cg to give me the following warning:
Warning: Desired error not necessarily achieved due to precision loss.
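For context, a minimal sketch of how such a call might look (using the same names as in gradientChecking below; my exact call is not shown here):

from scipy.optimize import fmin_cg

# sketch only: X, y, nn_params, computeCost and gradient are set up as in the code below
args = (X, y, input_layer_size, hidden_layer_size, num_labels, lamb)
opt_theta = fmin_cg(computeCost, nn_params, fprime=gradient, args=args, maxiter=50)  # maxiter chosen arbitrarily

Since fmin_cg uses fprime during its line searches, a gradient that is inconsistent with the cost is a typical cause of this precision-loss warning.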
Any help would be greatly appreciated! Here is the relevant code:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(z))

def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))

def randInitializeWeights(layer_in, layer_out):
    matrix = np.zeros((layer_out, 1 + layer_in))
    epsilon_init = 0.12
    matrix = np.random.rand(layer_out, 1 + layer_in) * 2 * epsilon_init - epsilon_init
    return matrix

def gradient(theta, *args):
    X, y, num_inputs, num_hidden_units, num_labels, lamb = args
    m = len(X)
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1)))
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1))
    theta1_grad = np.zeros(theta1.shape)
    theta2_grad = np.zeros(theta2.shape)
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    for t in range(0, m):
        vec_y = np.zeros(num_labels)
        vec_y[y[t]] = 1
        vec_y = vec_y[:, np.newaxis]
        #feedforward to compute all the neuron activations
        a_1 = np.r_[[1], X[t]]
        a_1 = a_1[:, np.newaxis]
        z_2 = np.dot(theta1, a_1)
        a_2 = np.vstack([1, sigmoid(z_2)])
        z_3 = np.dot(theta2, a_2)
        a_3 = sigmoid(z_3)
        #error for output nodes
        del3 = a_3 - vec_y
        #error for hidden nodes
        del2 = np.multiply(np.dot(theta2.T, del3), sigmoid_gradient(np.vstack([1, z_2])))
        #remove bias unit
        del2 = del2[1:]
        #accumulate gradient
        delta1 = delta1 + del2*a_1.T
        delta2 = delta2 + del3*a_2.T
    #no need to regularize the first column
    theta1_grad[:, 0] = (1/m)*delta1[:, 0]
    theta2_grad[:, 0] = (1/m)*delta2[:, 0]
    #regularize the rest
    theta1_grad[:, 1:] = ((1/m) * delta1[:, 1:]) + (lamb/m)*theta1[:, 1:]
    theta2_grad[:, 1:] = ((1/m) * delta2[:, 1:]) + (lamb/m)*theta2[:, 1:]
    #unroll
    grad = np.hstack([theta1_grad.ravel(), theta2_grad.ravel()])
    return grad

def gradientChecking(lamb):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    theta2 = randInitializeWeights(hidden_layer_size, num_labels)
    X = np.random.rand(m, input_layer_size)
    y = np.array([1, 2, 0, 1, 2])
    nn_params = np.hstack([theta1.ravel(), theta2.ravel()])
    #calculate gradient with function
    grad = gradient(nn_params, X, y, input_layer_size, hidden_layer_size, num_labels, lamb)
    #calculate numerical gradient
    num_grad = computeNumericalGradient(lambda theta: computeCost(theta, X, y, input_layer_size, hidden_layer_size, num_labels, lamb), nn_params)
    print('Function Gradient', 'Numerical Gradient')
    for i in range(len(grad)):
        print(grad[i], num_grad[i])
    diff = np.linalg.norm(num_grad-grad)/np.linalg.norm(num_grad+grad)
    print('Relative Difference: ')
    print(diff)

def computeNumericalGradient(J, theta):
    numgrad = np.zeros(theta.shape)
    perturb = np.zeros(theta.shape)
    e = 0.0001
    #p starts at 1, so the element at index 0 is never perturbed
    #(its numerical gradient stays 0.0, as in the first row of the table above)
    for p in range(1, np.size(theta)):
        perturb[p] = e
        loss1 = J(theta - perturb)
        loss2 = J(theta + perturb)
        numgrad[p] = (loss2 - loss1) / (2*e)
        perturb[p] = 0
    return numgrad
You have an error in your sigmoid function. It should be like this:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
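As a quick sanity check (a small sketch of my own, not taken from your code): with exp(-z) the function is increasing and saturates at 1 for large positive z, while the original 1/(1+np.exp(z)) is its mirror image:

import numpy as np

def sigmoid_fixed(z):
    return 1 / (1 + np.exp(-z))   # corrected version

def sigmoid_buggy(z):
    return 1 / (1 + np.exp(z))    # original version, mirrored around z = 0

print(sigmoid_fixed(0.0), sigmoid_fixed(10.0), sigmoid_fixed(-10.0))   # 0.5, ~1.0, ~0.0
print(sigmoid_buggy(0.0), sigmoid_buggy(10.0))                         # 0.5, ~0.0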
I'm a little confused by your implementation of the backpropagation algorithm; I would do it without the for loop.
You did not post your computeCost, so I wrote it myself and checked the gradients. In my case both columns agree:
('Function Gradient', 'Numerical Gradient')
(-0.0087363416123043425, 0.0)
(0.017468375248392107, 0.0174683752529603)
(-0.0016267134050363559, -0.0016267134039793518)
(0.0018882373947080224, 0.0018882373997719526)
(-0.0063531428795779391, -0.0063531428762253483)
(0.0029882213493977773, 0.0029882213481435826)
(0.014295787205089885, 0.014295787205131916)
(-0.026668095974979808, -0.026668095973736428)
(0.0043373799514851595, 0.0043373799440971084)
(0.0063740837472641377, 0.0063740837497050506)
(0.0027102260448642525, 0.0027102260435896142)
(0.0067009063282609839, 0.0067009063298151261)
(-0.0029645476578591843, -0.0029645476562478734)
(-0.012000477453137556, -0.012000477451756808)
(-0.020065071389262716, -0.020065071393293721)
(0.010308693441913186, 0.010308693438876304)
(-0.0015996484140612609, -0.0015996484115099463)
(-0.0086037766244218914, -0.0086037766244828617)
(-0.0099431361329477934, -0.0099431361344493041)
(0.0062574996404342166, 0.0062574996406716821)
(0.30213488769328123, 0.3021348876908192)
(0.14900524972537924, 0.14900524972549789)
(0.13305168538400619, 0.13305168538479961)
(0.16730920742910549, 0.16730920743279754)
(0.14245586995768528, 0.14245586995365045)
(0.15465244296463604, 0.15465244296519742)
(0.10813908901043021, 0.10813908900342284)
(0.040844058224880242, 0.04084405822446513)
(0.040566215206120269, 0.040566215204762557)
(0.036451467449020114, 0.036451467448905817)
(0.065664340475228455, 0.065664340476168093)
(0.070753692265581092, 0.07075369226283712)
(0.088651862157018618, 0.088651862166777562)
(0.028272897964677978, 0.028272897965031518)
(0.026876928049457398, 0.026876928049812676)
(0.056512225949437798, 0.056512225949933992)
(0.051775047342360533, 0.051775047342772496)
(0.025689087137289929, 0.025689087135294386)
Relative Difference:
0.00878484310135
Here is my code:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))

def randInitializeWeights(layer_in, layer_out):
    matrix = np.zeros((layer_out, 1 + layer_in))
    epsilon_init = 0.12
    matrix = np.random.rand(layer_out, 1 + layer_in) * 2 * epsilon_init - epsilon_init
    return matrix

def gradient(theta, *args):
    X, y, num_inputs, num_hidden_units, num_labels, lamb = args
    m = len(X)
    y_bin = np.zeros((m, num_labels))
    for i in range(m):
        y_bin[i, y[i]] = 1
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1)))  #5x4
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1))  #3x6
    theta1_grad = np.zeros(theta1.shape)
    theta2_grad = np.zeros(theta2.shape)
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    #forward
    a_1 = np.hstack((np.ones((m, 1)), X))  #5x4
    z_2 = np.dot(a_1, theta1.transpose())  #5x5
    a_2 = sigmoid(z_2)  #5x5
    a_2 = np.hstack((np.ones((m, 1)), a_2))  #5x6
    z_3 = np.dot(a_2, theta2.transpose())  #5x3
    h = sigmoid(z_3)  #5x3
    #backward
    delta3 = h - y_bin  #5x3
    delta2 = np.dot(delta3, theta2[:, 1:num_hidden_units+1]) * sigmoid_gradient(z_2)  #5x5
    D1 = np.dot(delta2.transpose(), a_1)  #5x4
    D2 = np.dot(delta3.transpose(), a_2)  #3x6
    theta1_grad = D1/m  #5x4
    theta2_grad = D2/m  #3x6
    #regularization
    theta1_grad[:, 1:num_inputs+1] = theta1_grad[:, 1:num_inputs+1] + lamb/m * theta1[:, 1:num_inputs+1]
    theta2_grad[:, 1:num_hidden_units+1] = theta2_grad[:, 1:num_hidden_units+1] + lamb/m * theta2[:, 1:num_hidden_units+1]
    #unroll
    grad = np.hstack([theta1_grad.ravel(), theta2_grad.ravel()])
    return grad

def gradientChecking(lamb):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    theta2 = randInitializeWeights(hidden_layer_size, num_labels)
    X = np.random.rand(m, input_layer_size)
    y = np.array([1, 2, 0, 1, 2])
    nn_params = np.hstack([theta1.ravel(), theta2.ravel()])
    #calculate gradient with function
    grad = gradient(nn_params, X, y, input_layer_size, hidden_layer_size, num_labels, lamb)
    #calculate numerical gradient
    num_grad = computeNumericalGradient(lambda theta: computeCost(theta, X, y, input_layer_size, hidden_layer_size, num_labels, lamb), nn_params)
    print('Function Gradient', 'Numerical Gradient')
    for i in range(len(grad)):
        print(grad[i], num_grad[i])
    diff = np.linalg.norm(num_grad-grad)/np.linalg.norm(num_grad+grad)
    print('Relative Difference: ')
    print(diff)

def computeCost(theta, X, y, num_inputs, num_hidden_units, num_labels, lamb):
    m = len(X)
    y_bin = np.zeros((m, num_labels))
    for i in range(m):
        y_bin[i, y[i]] = 1
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1)))  #5x4
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1))  #3x6
    a_1 = np.hstack((np.ones((m, 1)), X))  #5x4
    z_2 = np.dot(a_1, theta1.transpose())  #5x5
    a_2 = sigmoid(z_2)  #5x5
    a_2 = np.hstack((np.ones((m, 1)), a_2))  #5x6
    z_3 = np.dot(a_2, theta2.transpose())  #5x3
    h = sigmoid(z_3)
    cost = np.sum(-y_bin * np.log(h) - (1-y_bin) * np.log(1-h))/m
    #regularization
    theta1_sq = theta1[:, 1:num_inputs+1] * theta1[:, 1:num_inputs+1]
    theta2_sq = theta2[:, 1:num_hidden_units+1] * theta2[:, 1:num_hidden_units+1]
    cost = cost + lamb/(2.0*m)*(np.sum(theta1_sq) + np.sum(theta2_sq))
    return cost

def computeNumericalGradient(J, theta):
    numgrad = np.zeros(theta.shape)
    perturb = np.zeros(theta.shape)
    e = 0.0001
    for p in range(1, np.size(theta)):
        perturb[p] = e
        loss1 = J(theta - perturb)
        loss2 = J(theta + perturb)
        numgrad[p] = (loss2 - loss1) / (2*e)
        perturb[p] = 0
    return numgrad

gradientChecking(1.0)