Neural Network for MNIST digits is not learning at all - problem with backpropagation (Michael Nielsen example)
I have been trying to code up the neural network for recognizing MNIST digits that Michael Nielsen presents here: http://neuralnetworksanddeeplearning.com/chap1.html
The original was written for Python 2.7, I believe; I am using Python 3. The network does go through the examples and update the weights and biases, but it is not learning, and it gets only about 10% of the test examples right (as good as random guessing).
I also tried simply copying the code from the site and running it under Python 2.7, and it works fine (up to about 95% accuracy). The only notable differences in my network are the dataset (I am using one downloaded directly from MNIST two days ago) and the two places where I switched np.dot to np.outer, just to make it easier to keep track of array shapes (I tried to stick with (N,) shapes rather than (N,1)). That part seems fine, though, since the layer sizes differ and the multiplications go through; a minimal shape sketch is below. I also use the same learning rate and layer sizes as in the example.
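For reference, here is a minimal sketch (my own illustration, not part of Nielsen's code) of why np.outer is the drop-in replacement when the vectors are flat: the per-layer weight gradient is the outer product of a layer's delta with the previous layer's activations, which has the same (out, in) shape as the weight matrix.

import numpy as np

# With flat (N,) vectors, np.dot of two vectors is an inner product (and
# requires equal lengths), so the weight gradient must use np.outer.
delta = np.random.randn(30)         # delta at a layer with 30 units
activation = np.random.randn(784)   # activations of the previous layer

grad_w = np.outer(delta, activation)
print(grad_w.shape)                 # (30, 784) - same shape as the weights

# With (N,1) column vectors, as in Nielsen's original code, the same
# gradient is written as a matrix product instead:
delta_col = delta.reshape(-1, 1)
act_col = activation.reshape(-1, 1)
print(np.dot(delta_col, act_col.transpose()).shape)   # also (30, 784)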
I cannot see what is throwing the net off. If anyone has tried this, or has any insight into it, I would appreciate it.
Thanks
The code:
import matplotlib.pyplot as plt
import numpy as np
import idx2numpy
import random

### LOAD DATASET ###
train = idx2numpy.convert_from_file("mnist/train-images.idx3-ubyte")
train_labels = idx2numpy.convert_from_file("mnist/train-labels.idx1-ubyte")
test = idx2numpy.convert_from_file("mnist/t10k-images.idx3-ubyte")
test_labels = idx2numpy.convert_from_file("mnist/t10k-labels.idx1-ubyte")

def vectorize(x):
    """Turn a digit label into a 10-dimensional one-hot vector."""
    e = np.zeros(10)
    e[x] = 1.0
    return e

training_images = [np.reshape(i, (784,)) / 255 for i in train]
training_labels = [vectorize(i) for i in train_labels]
training_set = list(zip(training_images, training_labels))

test_images = [np.reshape(i, (784,)) / 255 for i in test]
test_set = list(zip(training_images, test_labels))

### NETWORK CLASS ###
class myNet():
    def __init__(self, sizes):
        self.sizes = sizes
        self.N = len(sizes)
        self.w = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
        self.b = [np.random.randn(i) for i in sizes[1:]]

    def sigmoid(self, z):
        return 1.0 / (1.0 + np.exp(-z))

    def sigmoid_prime(self, z):
        return self.sigmoid(z) * (1 - self.sigmoid(z))

    def cost_derivative(self, output_activations, y):
        return (output_activations - y)

    def feedforward(self, a):
        for bb, ww in zip(self.b, self.w):
            a = self.sigmoid(np.dot(ww, a) + bb)
        return a

    def backprop(self, x, y):
        nabla_b = [np.zeros(bb.shape) for bb in self.b]
        nabla_w = [np.zeros(ww.shape) for ww in self.w]
        # forward pass
        activation = x
        activations = [x]  # list to store all the activations, layer by layer
        zs = []  # list to store all the z vectors, layer by layer
        for bb, ww in zip(self.b, self.w):
            z = np.dot(ww, activation) + bb
            zs.append(z)
            activation = self.sigmoid(z)
            activations.append(activation)
        # backward pass
        delta = self.cost_derivative(activations[-1], y) * self.sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.outer(delta, activations[-2])
        # Note that the variable l in the loop below is used a little
        # differently to the notation in Chapter 2 of the book: here
        # l = 1 means the last layer of neurons, l = 2 the second-last,
        # and so on.
        for l in range(2, self.N):
            z = zs[-l]
            sp = self.sigmoid_prime(z)
            delta = np.dot(self.w[-l + 1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.outer(delta, activations[-l - 1])
        return (nabla_b, nabla_w)

    def update(self, mini_batch, eta):
        """Apply one gradient-descent step, averaging the gradients
        over a single mini-batch."""
        nabla_b = [np.zeros(bb.shape) for bb in self.b]
        nabla_w = [np.zeros(ww.shape) for ww in self.w]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.w = [ww - (eta / len(mini_batch)) * nw
                  for ww, nw in zip(self.w, nabla_w)]
        self.b = [bb - (eta / len(mini_batch)) * nb
                  for bb, nb in zip(self.b, nabla_b)]
        return

    def gradient_descent(self, training_data, epochs, mini_batch_size, eta, test_data):
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k + mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update(mini_batch, eta)
            print("Epoch {0}: {1}".format(
                j, self.evaluate(test_data)))
        return

    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

sizes = [28*28, 30, 10]
net = myNet(sizes)
net.gradient_descent(training_set, 30, 10, 3.0, test_set)
I found the error... I had mistakenly zipped the training images together with the test labels to form the test set, which is obviously not how it should be. Now that the test set is formed correctly, everything works, with accuracy up to around 95%. Below is the full corrected code (works in Python 3); a small sanity check that would have caught the mistake is sketched after the listing.
import matplotlib.pyplot as plt
import numpy as np
import idx2numpy
import random

def vectorize(x):
    """Turn a digit label into a 10-dimensional one-hot vector."""
    e = np.zeros(10)
    e[x] = 1.0
    return e

### LOAD DATASET ###
train_images = idx2numpy.convert_from_file("mnist/train-images.idx3-ubyte") / 255
train_labels = idx2numpy.convert_from_file("mnist/train-labels.idx1-ubyte")
train_images = [np.reshape(x, (784,)).astype('float32') for x in train_images]
train_labels = [vectorize(i) for i in train_labels]

test_images = idx2numpy.convert_from_file("mnist/t10k-images.idx3-ubyte") / 255
test_labels = idx2numpy.convert_from_file("mnist/t10k-labels.idx1-ubyte")
test_images = [np.reshape(x, (784,)).astype('float32') for x in test_images]

training_set = list(zip(train_images, train_labels))
test_set = list(zip(test_images, test_labels))  ## THIS IS WHERE I MESSED UP

### NETWORK CLASS ###
class myNet():
    def __init__(self, sizes):
        self.sizes = sizes
        self.N = len(sizes)
        self.w = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
        self.b = [np.random.randn(i) for i in sizes[1:]]

    def sigmoid(self, z):
        return 1.0 / (1.0 + np.exp(-z))

    def sigmoid_prime(self, z):
        return self.sigmoid(z) * (1 - self.sigmoid(z))

    def cost_derivative(self, output_activations, y):
        return (output_activations - y)

    def feedforward(self, a):
        for bb, ww in zip(self.b, self.w):
            a = self.sigmoid(np.dot(ww, a) + bb)
        return a

    def backprop(self, x, y):
        nabla_b = [np.zeros(bb.shape) for bb in self.b]
        nabla_w = [np.zeros(ww.shape) for ww in self.w]
        # forward pass
        activation = x
        activations = [x]  # list to store all the activations, layer by layer
        zs = []  # list to store all the z vectors, layer by layer
        for bb, ww in zip(self.b, self.w):
            z = np.dot(ww, activation) + bb
            zs.append(z)
            activation = self.sigmoid(z)
            activations.append(activation)
        # backward pass
        delta = self.cost_derivative(activations[-1], y) * self.sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.outer(delta, activations[-2])
        # Note that the variable l in the loop below is used a little
        # differently to the notation in Chapter 2 of the book: here
        # l = 1 means the last layer of neurons, l = 2 the second-last,
        # and so on.
        for l in range(2, self.N):
            z = zs[-l]
            sp = self.sigmoid_prime(z)
            delta = np.dot(self.w[-l + 1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.outer(delta, activations[-l - 1])
        return (nabla_b, nabla_w)

    def update(self, mini_batch, eta):
        """Apply one gradient-descent step, averaging the gradients
        over a single mini-batch."""
        nabla_b = [np.zeros(bb.shape) for bb in self.b]
        nabla_w = [np.zeros(ww.shape) for ww in self.w]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.w = [ww - (eta / len(mini_batch)) * nw
                  for ww, nw in zip(self.w, nabla_w)]
        self.b = [bb - (eta / len(mini_batch)) * nb
                  for bb, nb in zip(self.b, nabla_b)]
        return

    def gradient_descent(self, training_data, epochs, mini_batch_size, eta, test_data):
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k + mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update(mini_batch, eta)
            print("Epoch {0}: {1}".format(
                j, self.evaluate(test_data)))
        return

    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

sizes = [28*28, 30, 10]
net = myNet(sizes)
net.gradient_descent(training_set, 30, 10, 3.0, test_set)
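As an aside, here is the quick sanity check mentioned above (my own addition, not part of Nielsen's code): every image stored in test_set should be identical to the corresponding entry of test_images. On the buggy version, which zipped training_images with test_labels, the assertion fails on the first pair. Note that zip() silently truncates to the shorter input, so the wrong pairing raised no error and only showed up as ~10% accuracy.

# Hypothetical sanity check: confirm the evaluation pairs were built
# from the test images rather than the training images.
for (img, _), ref in zip(test_set, test_images):
    assert np.array_equal(img, ref), "test_set was built from the wrong images!"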