
Why does my neural network have extremely low weights after a few epochs?

I just started learning about neural networks and this is my first one. The problem is that the more data I have, the lower the weights get after 2-3 epochs, which is unusual, and it causes my NN to learn nothing.

To reproduce: in the DataSet class, look for the function CreateData and change nbofexample to something like 20; if you print the weights you will see that they stay in a normal range (evenly spread between -1 and 1). But if you set nbofexample to around 200, then after only 2 or 3 epochs most of the weights of the last layer end up extremely close to 0, and they stay in that zone for the rest of the training. Obviously, this makes the NN fail.

By the way, my NN basically analyzes arrays of digits from 0 to 9, divided by 10 as normalization, to check whether the array is sorted. I put lots of comments in the code below, so it should be easy to follow.
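For instance, two made-up examples of what one input/output pair looks like with that encoding:

example_sorted   = [0.1, 0.2, 0.3, 0.7]   #digits 1,2,3,7 are in non-decreasing order -> expected output 1
example_unsorted = [0.3, 0.1, 0.9, 0.2]   #digits 3,1,9,2 are not sorted -> expected output 0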

There is probably a simple fix, but I just can't see it :(

Here is the full code if you want to try it (it's in Python, by the way):

import numpy as np
import time
import random

#This class is only used for creating the data if needed
class DataSet():
    
    #check if sorted
    def checkPossibility(A):
        return sorted(A) == A

    #will be used later for more complex problems (taken from the faster answer of a coding challenge on LeetCode)
    #def checkPossibility(A):
    #    p = None
    #    for i in range(len(A) - 1):
    #        if A[i] > A[i+1]:
    #            if p is not None:
    #                return False
    #            p = i
    #    return (p is None or p == 0 or p == len(A)-2 or
    #            A[p-1] <= A[p+1] or A[p] <= A[p+2])
    

    #returns inputs and outputs using my poorly written algorithm
    def CreateData():
        
        #settings
        nbofchar=4
        nbofexample=200
        
        #initialize arrays
        inputs = [0]*nbofchar;
        output = [1]
        
        #handling dumbness
        if nbofexample>pow(10,nbofchar): 
            print("Too much data... resizing to max data")
            nbofexample=pow(10,nbofchar)
        elif nbofexample==0:
            print("You need examples to train! (Error nbofexample==0)")
        
        #if more than half of the max possible examples are requested, create all possible examples and delete randomly until it's the requested size
        if nbofexample>pow(10,nbofchar)/2:
            
            #creating all possible examples
            for i in range(1,pow(10,nbofchar)): 
                new_ex = [int(a) for a in str(i)]
                while len(new_ex)<nbofchar:
                    new_ex=[0]+new_ex
                inputs = np.vstack((inputs,np.dot(new_ex,1/10)))  #normalization /10 so the value is between 0 and 1 ¯\_(ツ)_/¯
                output = np.vstack((output,[int(DataSet.checkPossibility(new_ex))]))
            
            #deleting     
            while len(inputs)>nbofexample:
                index = random.randint(0,len(inputs)-1)
                inputs = np.delete(inputs,index,axis=0)   #axis=0 removes a whole row instead of flattening the array
                output = np.delete(output,index,axis=0)

            return inputs, output
        
        #if half or less are requested, create examples randomly until it's the requested size
        else:
            i=1
            while i < nbofexample: 
                new_ex = [random.randint(0,9) for a in range(nbofchar)]
                if sum(np.any(inputs)==new_ex)==0:
                    i+=1
                    inputs = np.vstack((inputs,np.dot(new_ex,1/10)))    #normalization /10 so the value is between 0 and 1 ¯\_(ツ)_/¯
                    output = np.vstack((output,[int(DataSet.checkPossibility(new_ex))]))
            return inputs, output

#assigning weights to each layer
class NeuLayer():
    def __init__(self, nbofneuron, inputsperneuron):
        self.weight = 2 * np.random.random((inputsperneuron,nbofneuron))-1

#the actual neural network
class NeuNet():    

        def __init__(self, layers):
            self.layers = layers

        def _sigmoid(self, x):
            k = 1
            return 1 / (1+np.exp(-x/k))

        def _sigmoid_derivative(self, x):
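            #note: x is expected to already be a sigmoid output, so this computes s*(1-s)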
            return x * (1-x)

        def train(self, training_set_inputs, training_set_outputs, nboftime):

            #debug
            timer1 = 0


            if len(self.layers)<2: return

            for iteration in range(nboftime):
                
                delta = [0] * len(self.layers)
                error = [0] * len(self.layers)
                outputlayers = self.think(training_set_inputs)
                
                #find deltas for each layer "i" (to be able to properly change weights)
                for i in range(len(self.layers)-1,-1,-1):
                    if i==len(self.layers)-1:
                        error[i] = training_set_outputs - outputlayers[i]                      
                    else:
                        error[i] = np.dot(delta[i+1],self.layers[i+1].weight.T)
                    delta[i] = error[i] * self._sigmoid_derivative(outputlayers[i])              


                #update the weights of each layer "i"
                for i in range(len(self.layers)):
                   if i==0:
                       self.layers[0].weight += np.dot(training_set_inputs.T,delta[0])
                   else:
                       self.layers[i].weight += np.dot(outputlayers[i-1].T,delta[i])

                #display progression and the test result
                if Display_progression: 
                    if timer1<time.time():
                        timer1=time.time()+delay
                        value = ((iteration+1)/nboftime)*100
                        test_input = np.array([.1,.2,.1,.1])
                        print('%.2f'%value+"%     test_input = " + str(test_input) + "     test_output = "+ str(self.think(test_input)[-1]))

        #return output of each layer from an input
        def think(self, input):
            outforlayers = [None]*len(self.layers)
            outforlayer = input
            for i in range(len(self.layers)):
                outforlayer = self._sigmoid(np.dot(outforlayer, self.layers[i].weight))
                outforlayers[i] = outforlayer
            return outforlayers

#datamaker
creating_data=True
train = True

if creating_data:
    
    #creates files with inputs and their expected output
    print("Start creating data...")
    input, output = DataSet.CreateData();
    print("Data created!")
    file = open("data_input","wb")
    np.save(file, input)
    file.close()
    file = open("data_output","wb")
    np.save(file, output)
    file.close()

if train:

    default_data_set=False

    if default_data_set:
        #default training set
        inp_training = np.array([[0, 0, 0, 0, 0], [0.1, 0, 0, 0, 0], [0, 0.1, 0, 0, 0], [0.1, 0.1, 0, 0, 0], [0, 0, 0.1, 0, 0], [0.1, 0, 0.1, 0, 0], [0, 0.1, 0.1, 0, 0], [0.1, 0.1, 0.1, 0, 0],
                             [0, 0, 0, 0.1, 0], [0.1, 0, 0, 0.1, 0], [0, 0.1, 0, 0.1, 0], [0.1, 0.1, 0, 0.1, 0], [0, 0, 0.1, 0.1, 0], [0.1, 0, 0.1, 0.1, 0], [0, 0.1, 0.1, 0.1, 0], [0.1, 0.1, 0.1, 0.1, 0],
                             [0, 0, 0, 0, 0.1], [0.1, 0, 0, 0, 0.1], [0, 0.1, 0, 0, 0.1], [0.1, 0.1, 0, 0, 0.1], [0, 0, 0.1, 0, 0.1], [0.1, 0, 0.1, 0, 0.1], [0, 0.1, 0.1, 0, 0.1], [0.1, 0.1, 0.1, 0, 0.1],
                             [0, 0, 0, 0.1, 0.1], [0.1, 0, 0, 0.1, 0.1], [0, 0.1, 0, 0.1, 0.1], [0.1, 0.1, 0, 0.1, 0.1], [0, 0, 0.1, 0.1, 0.1], [0.1, 0, 0.1, 0.1, 0.1], [0, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.1, 0.1, 0.1]])
        out_training = np.array([[0,0,0,0,0,0,0,1,
                             0,0,0,1,0,1,1,1,
                             0,0,0,1,0,1,1,1,
                             0,1,1,1,1,1,1,1]]).T

    else:
        print("Loading data files...")
        file = open("data_input","rb")
        inp_training = np.load(file)
        file.close()
        file = open("data_output","rb")
        out_training = np.load(file)
        file.close()
        print("Done reading from data files!")


    #debug
    Display_progression = True;
    delay = 1   #seconds

    #initialize
    np.random.seed(5)
    netlayer_input = NeuLayer(10,len(inp_training[0]))
    netlayer2 = NeuLayer(10,10)
    netlayer3 = NeuLayer(10,10)
    netlayer4 = NeuLayer(10,10)
    netlayer_out = NeuLayer(len(out_training[0]),10)
    All_layers = [netlayer_input,netlayer2,netlayer3,netlayer4,netlayer_out]
    brain = NeuNet(All_layers)

    #train
    print("Start training...")
    brain.train(inp_training, out_training, 100000)
    print("Done!")


    #final test
    outputfinal = brain.think(np.array([0,.1,.3,.7]))


    #output
    a = outputfinal[-1] #[-1] so we get the last layer's output(s)
    print(a)


Note: this is my first question on Stack Overflow, so please tell me if I'm missing key information.

Your neural network is probably running into what is known as the vanishing gradient problem, which is caused by the more classical activations such as Sigmoid or Tanh.

In layman's terms, activations like Sigmoid and Tanh squash their input, right? For example, sigmoid(10) and sigmoid(100) are about .9999 and 1 respectively. Even though the inputs differ a lot, the outputs barely change - the function is practically constant at that point. Where a function is practically constant, its derivative tends to zero (or to a very small value). Those very small derivatives/gradients get multiplied with each other and effectively become zero, which prevents your model from learning anything - your weights get stuck and stop updating.
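To make this concrete, here is a small numeric sketch (just an illustration, not part of your code):

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)

#the derivative collapses as soon as the input moves away from 0
for x in [0, 2, 5, 10]:
    print(x, sigmoid(x), sigmoid_derivative(x))
#0   0.5        0.25
#2   0.88...    0.10...
#5   0.993...   0.0066...
#10  0.99995... 0.000045...

#with several layers these small factors get multiplied together,
#e.g. 0.25**5 is already ~0.001 and 0.01**5 is ~1e-10
print(0.25**5, 0.01**5)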

I suggest you do some further reading on this topic in your own time. Among several solutions, one way to address the problem is to use a different activation, such as ReLU.
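For example, a minimal sketch of what a ReLU activation could look like (the names relu/relu_derivative are mine, and the derivative follows the same convention as your _sigmoid_derivative, i.e. it is applied to the layer's output; you would typically still keep a sigmoid on the output layer since your targets are 0/1):

import numpy as np

def relu(x):
    return np.maximum(0, x)

def relu_derivative(out):
    #'out' is the layer output, mirroring how _sigmoid_derivative(outputlayers[i]) is used in train()
    return (out > 0).astype(float)

#unlike the sigmoid, the derivative does not shrink for large positive inputs
print(relu_derivative(relu(np.array([-3.0, 0.5, 10.0, 100.0]))))   #[0. 1. 1. 1.]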
