I am building a basic 3-layer neural network in Python. After writing a gradient function, I ran gradient checking on it against a numerical gradient. The relative difference came out large, so I unrolled both gradients for the weight matrices and compared them side by side.
Function Gradient Numerical Gradient
-0.000968788380809 0.0
0.0153540197907 0.0153540197889
-0.00584391679274 -0.00584391679048
-0.00490359558077 -0.00490359558514
-0.00171892592537 -0.0017189259216
0.00913024106334 0.00913024106319
-0.0182154767069 -0.0182154767092
0.0152611324409 0.01526113244
-0.00373505297372 -0.00373505297135
-0.00513225994728 -0.00513225994814
-0.00531954399401 -0.00531954399641
-0.0185748801227 -0.0185748801163
0.00745186105851 0.00745186105267
0.0134566626927 0.0134566626908
0.0251548691426 0.0251548691388
0.00609388350562 0.00609388350226
-0.00471176815719 -0.00471176815564
0.0113580721225 0.0113580721228
0.00465172663488 0.00465172663944
-0.0221326283708 -0.02213262837
0.300007655583 -0.300007655583 <- diverges from here on; this is the start of the theta2 gradient
0.155638694282 -0.15345321819
0.147747817305 -0.149026829224
0.150703152382 -0.172330417252
0.156307235611 -0.116975643856
0.136898763375 -0.170081036297
0.0621121242042 -0.0621121242372
0.0442762464937 -0.0187338352431
0.0489123689979 -0.00938236375481
0.0244392582651 -0.0465061209964
0.0237741996575 -0.028319115235
0.0313594790974 -0.0330473942922
0.106306327946 -0.106306327941
0.0348751481828 -0.0704775747806
0.0303373211657 -0.0756744476749
0.0633094699759 -0.0461971224763
0.0524239030728 -0.0477244101571
0.0633274024777 -0.0397657392082
Relative Difference:
6.61473694017
The first 20 elements in each list correspond to the gradient for the first weight matrix, and the remaining 18 to the gradient for the second. From what I can see, the error occurs in the last 18 elements (i.e. the theta2 gradient), where the function gradient and the "correct" numerical gradient disagree, most visibly in sign. This also causes scipy.optimize.fmin_cg to give me the following warning:
Warning: Desired error not necessarily achieved due to precision loss.
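For context, a minimal sketch of how such a call might look (using the same names as in gradientChecking below; my exact call is not shown here):

from scipy.optimize import fmin_cg

# sketch only: X, y, nn_params, computeCost and gradient are set up as in the code below
args = (X, y, input_layer_size, hidden_layer_size, num_labels, lamb)
opt_theta = fmin_cg(computeCost, nn_params, fprime=gradient, args=args, maxiter=50)  # maxiter chosen arbitrarily

Since fmin_cg uses fprime during its line searches, a gradient that is inconsistent with the cost is a typical cause of this precision-loss warning.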
Any help would be greatly appreciated! Here is the relevant code:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(z))

def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))

def randInitializeWeights(layer_in, layer_out):
    matrix = np.zeros((layer_out, 1 + layer_in))
    epsilon_init = 0.12
    matrix = np.random.rand(layer_out, 1 + layer_in) * 2 * epsilon_init - epsilon_init
    return matrix

def gradient(theta, *args):
    X, y, num_inputs, num_hidden_units, num_labels, lamb = args
    m = len(X)
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1)))
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1))
    theta1_grad = np.zeros(theta1.shape)
    theta2_grad = np.zeros(theta2.shape)
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    for t in range(0, m):
        vec_y = np.zeros(num_labels)
        vec_y[y[t]] = 1
        vec_y = vec_y[:, np.newaxis]
        #feedforward to compute all the neuron activations
        a_1 = np.r_[[1], X[t]]
        a_1 = a_1[:, np.newaxis]
        z_2 = np.dot(theta1, a_1)
        a_2 = np.vstack([1, sigmoid(z_2)])
        z_3 = np.dot(theta2, a_2)
        a_3 = sigmoid(z_3)
        #error for output nodes
        del3 = a_3 - vec_y
        #error for hidden nodes
        del2 = np.multiply(np.dot(theta2.T, del3), sigmoid_gradient(np.vstack([1, z_2])))
        #remove bias unit
        del2 = del2[1:]
        #accumulate gradient
        delta1 = delta1 + del2*a_1.T
        delta2 = delta2 + del3*a_2.T
    #no need to regularize the first column
    theta1_grad[:, 0] = (1/m)*delta1[:, 0]
    theta2_grad[:, 0] = (1/m)*delta2[:, 0]
    #regularize the rest
    theta1_grad[:, 1:] = ((1/m) * delta1[:, 1:]) + (lamb/m)*theta1[:, 1:]
    theta2_grad[:, 1:] = ((1/m) * delta2[:, 1:]) + (lamb/m)*theta2[:, 1:]
    #unroll
    grad = np.hstack([theta1_grad.ravel(), theta2_grad.ravel()])
    return grad

def gradientChecking(lamb):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    theta2 = randInitializeWeights(hidden_layer_size, num_labels)
    X = np.random.rand(m, input_layer_size)
    y = np.array([1, 2, 0, 1, 2])
    nn_params = np.hstack([theta1.ravel(), theta2.ravel()])
    #calculate gradient with function
    grad = gradient(nn_params, X, y, input_layer_size, hidden_layer_size, num_labels, lamb)
    #calculate numerical gradient
    num_grad = computeNumericalGradient(lambda theta: computeCost(theta, X, y, input_layer_size, hidden_layer_size, num_labels, lamb), nn_params)
    print('Function Gradient', 'Numerical Gradient')
    for i in range(len(grad)):
        print(grad[i], num_grad[i])
    diff = np.linalg.norm(num_grad-grad)/np.linalg.norm(num_grad+grad)
    print('Relative Difference: ')
    print(diff)

def computeNumericalGradient(J, theta):
    numgrad = np.zeros(theta.shape)
    perturb = np.zeros(theta.shape)
    e = 0.0001
    #p starts at 1, so the element at index 0 is never perturbed
    #(its numerical gradient stays 0.0, as in the first row of the table above)
    for p in range(1, np.size(theta)):
        perturb[p] = e
        loss1 = J(theta - perturb)
        loss2 = J(theta + perturb)
        numgrad[p] = (loss2 - loss1) / (2*e)
        perturb[p] = 0
    return numgrad
You have an error in your sigmoid function. It should be like this:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
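As a quick sanity check (a small sketch of my own, not taken from your code): with exp(-z) the function is increasing and saturates at 1 for large positive z, while the original 1/(1+np.exp(z)) is its mirror image:

import numpy as np

def sigmoid_fixed(z):
    return 1 / (1 + np.exp(-z))   # corrected version

def sigmoid_buggy(z):
    return 1 / (1 + np.exp(z))    # original version, mirrored around z = 0

print(sigmoid_fixed(0.0), sigmoid_fixed(10.0), sigmoid_fixed(-10.0))   # 0.5, ~1.0, ~0.0
print(sigmoid_buggy(0.0), sigmoid_buggy(10.0))                         # 0.5, ~0.0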
I'm a little confused by your implementation of the backpropagation algorithm; I would do it without the for loop.
You did not post your computeCost, so I wrote it myself and checked the gradients. In my case both columns agree:
('Function Gradient', 'Numerical Gradient')
(-0.0087363416123043425, 0.0)
(0.017468375248392107, 0.0174683752529603)
(-0.0016267134050363559, -0.0016267134039793518)
(0.0018882373947080224, 0.0018882373997719526)
(-0.0063531428795779391, -0.0063531428762253483)
(0.0029882213493977773, 0.0029882213481435826)
(0.014295787205089885, 0.014295787205131916)
(-0.026668095974979808, -0.026668095973736428)
(0.0043373799514851595, 0.0043373799440971084)
(0.0063740837472641377, 0.0063740837497050506)
(0.0027102260448642525, 0.0027102260435896142)
(0.0067009063282609839, 0.0067009063298151261)
(-0.0029645476578591843, -0.0029645476562478734)
(-0.012000477453137556, -0.012000477451756808)
(-0.020065071389262716, -0.020065071393293721)
(0.010308693441913186, 0.010308693438876304)
(-0.0015996484140612609, -0.0015996484115099463)
(-0.0086037766244218914, -0.0086037766244828617)
(-0.0099431361329477934, -0.0099431361344493041)
(0.0062574996404342166, 0.0062574996406716821)
(0.30213488769328123, 0.3021348876908192)
(0.14900524972537924, 0.14900524972549789)
(0.13305168538400619, 0.13305168538479961)
(0.16730920742910549, 0.16730920743279754)
(0.14245586995768528, 0.14245586995365045)
(0.15465244296463604, 0.15465244296519742)
(0.10813908901043021, 0.10813908900342284)
(0.040844058224880242, 0.04084405822446513)
(0.040566215206120269, 0.040566215204762557)
(0.036451467449020114, 0.036451467448905817)
(0.065664340475228455, 0.065664340476168093)
(0.070753692265581092, 0.07075369226283712)
(0.088651862157018618, 0.088651862166777562)
(0.028272897964677978, 0.028272897965031518)
(0.026876928049457398, 0.026876928049812676)
(0.056512225949437798, 0.056512225949933992)
(0.051775047342360533, 0.051775047342772496)
(0.025689087137289929, 0.025689087135294386)
Relative Difference:
0.00878484310135
Here is my code:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))

def randInitializeWeights(layer_in, layer_out):
    matrix = np.zeros((layer_out, 1 + layer_in))
    epsilon_init = 0.12
    matrix = np.random.rand(layer_out, 1 + layer_in) * 2 * epsilon_init - epsilon_init
    return matrix

def gradient(theta, *args):
    X, y, num_inputs, num_hidden_units, num_labels, lamb = args
    m = len(X)
    y_bin = np.zeros((m, num_labels))
    for i in range(m):
        y_bin[i, y[i]] = 1
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1)))  #5x4
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1))  #3x6
    theta1_grad = np.zeros(theta1.shape)
    theta2_grad = np.zeros(theta2.shape)
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    #forward
    a_1 = np.hstack((np.ones((m, 1)), X))  #5x4
    z_2 = np.dot(a_1, theta1.transpose())  #5x5
    a_2 = sigmoid(z_2)  #5x5
    a_2 = np.hstack((np.ones((m, 1)), a_2))  #5x6
    z_3 = np.dot(a_2, theta2.transpose())  #5x3
    h = sigmoid(z_3)  #5x3
    #backward
    delta3 = h - y_bin  #5x3
    delta2 = np.dot(delta3, theta2[:, 1:num_hidden_units+1]) * sigmoid_gradient(z_2)  #5x5
    D1 = np.dot(delta2.transpose(), a_1)  #5x4
    D2 = np.dot(delta3.transpose(), a_2)  #3x6
    theta1_grad = D1/m  #5x4
    theta2_grad = D2/m  #3x6
    #regularization
    theta1_grad[:, 1:num_inputs+1] = theta1_grad[:, 1:num_inputs+1] + lamb/m * theta1[:, 1:num_inputs+1]
    theta2_grad[:, 1:num_hidden_units+1] = theta2_grad[:, 1:num_hidden_units+1] + lamb/m * theta2[:, 1:num_hidden_units+1]
    #unroll
    grad = np.hstack([theta1_grad.ravel(), theta2_grad.ravel()])
    return grad

def gradientChecking(lamb):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    theta2 = randInitializeWeights(hidden_layer_size, num_labels)
    X = np.random.rand(m, input_layer_size)
    y = np.array([1, 2, 0, 1, 2])
    nn_params = np.hstack([theta1.ravel(), theta2.ravel()])
    #calculate gradient with function
    grad = gradient(nn_params, X, y, input_layer_size, hidden_layer_size, num_labels, lamb)
    #calculate numerical gradient
    num_grad = computeNumericalGradient(lambda theta: computeCost(theta, X, y, input_layer_size, hidden_layer_size, num_labels, lamb), nn_params)
    print('Function Gradient', 'Numerical Gradient')
    for i in range(len(grad)):
        print(grad[i], num_grad[i])
    diff = np.linalg.norm(num_grad-grad)/np.linalg.norm(num_grad+grad)
    print('Relative Difference: ')
    print(diff)

def computeCost(theta, X, y, num_inputs, num_hidden_units, num_labels, lamb):
    m = len(X)
    y_bin = np.zeros((m, num_labels))
    for i in range(m):
        y_bin[i, y[i]] = 1
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1)))  #5x4
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1))  #3x6
    a_1 = np.hstack((np.ones((m, 1)), X))  #5x4
    z_2 = np.dot(a_1, theta1.transpose())  #5x5
    a_2 = sigmoid(z_2)  #5x5
    a_2 = np.hstack((np.ones((m, 1)), a_2))  #5x6
    z_3 = np.dot(a_2, theta2.transpose())  #5x3
    h = sigmoid(z_3)
    cost = np.sum(-y_bin * np.log(h) - (1-y_bin) * np.log(1-h))/m
    #regularization
    theta1_sq = theta1[:, 1:num_inputs+1] * theta1[:, 1:num_inputs+1]
    theta2_sq = theta2[:, 1:num_hidden_units+1] * theta2[:, 1:num_hidden_units+1]
    cost = cost + lamb/(2.0*m)*(np.sum(theta1_sq) + np.sum(theta2_sq))
    return cost

def computeNumericalGradient(J, theta):
    numgrad = np.zeros(theta.shape)
    perturb = np.zeros(theta.shape)
    e = 0.0001
    for p in range(1, np.size(theta)):
        perturb[p] = e
        loss1 = J(theta - perturb)
        loss2 = J(theta + perturb)
        numgrad[p] = (loss2 - loss1) / (2*e)
        perturb[p] = 0
    return numgrad

gradientChecking(1.0)