
Not using recurrent_dropout in Colab crashes the model?

I'm trying to train a simple TensorFlow model to detect the sentiment of tweets. The datatypes and array sizes are consistent, and the model trains just fine when recurrent_dropout is set to some float value. However, that disables cuDNN and I'd really like to speed this up (don't we all), but whenever I remove the recurrent_dropout argument, model training crashes before the end of the first epoch.
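For context, the difference between the two setups is roughly this (512 units is just the size I'm using; the eligibility conditions are the ones listed in the TF 2.x Keras LSTM docs):

import tensorflow as tf

# Non-zero recurrent_dropout forces the generic (non-cuDNN) LSTM implementation -- slow, but it trains without crashing
lstm_generic = tf.keras.layers.LSTM(512, dropout=0.2, recurrent_dropout=0.2)

# With the defaults activation='tanh', recurrent_activation='sigmoid', recurrent_dropout=0,
# unroll=False, use_bias=True, and inputs either unmasked or strictly right-padded,
# the layer can use the fused cuDNN kernel on GPU
lstm_cudnn = tf.keras.layers.LSTM(512, dropout=0.2)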

Below is the relevant code; I've left out the imports and the loading of the CSV files. After the code are the final input dimensions and the error output. Additionally, I have figured out why Colab seemed to be cutting the training data: Colab displays the number of batches rather than the number of sequences, so with the default batch size of 32 we were seeing 859 steps. The crash when not using recurrent_dropout is still an issue. Side note: this code is a very rough draft, with the data cleaning all done within the same notebook, hence the lack of typical formatting.
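As a quick sanity check of that 859 figure (assuming all 27,481 padded sequences go to fit with the default batch size of 32, as in the run that produced the traceback below):

import math

num_sequences = 27481  # rows in the padded training array
batch_size = 32        # Keras default
print(math.ceil(num_sequences / batch_size))  # 859 steps per epoch -- the number Colab shows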

def remove_case(X):
    removed_case = []
    X = X.copy()
    for text in X:
        text = str(text).lower()
        removed_case.append(text)
    X = removed_case
    return X


def remove_hyperlinks(X):
    removed_hyperlinks = []
    X = X.copy()
    for text in X:
        text = str(text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'https\S+', '', text)
        text = re.sub(r'www\S+', '', text)
        removed_hyperlinks.append(text)
    X = removed_hyperlinks
    return X


def remove_punctuation(X):
    removed_punc = []
    X = X.copy()
    for text in X:
        text = str(text)
        text = "".join([char for char in text if char not in punctuation])
        removed_punc.append(text)
    X = removed_punc
    return X


def split_text(X):
    split_tweets = []
    X = X.copy()
    for text in X:
        text = str(text).split()
        split_tweets.append(text)
    X = split_tweets
    return X


def map_sentiment(X, l, m, n):
    keys = ['negative', 'neutral', 'positive']
    values = [l, m, n]
    dictionary = dict(zip(keys, values))
    X = X.copy()
    X = X.map(dictionary)
    return X


# # def sentiment_to_onehot(X):
#     sentiment_foofs = []
#     X = X.copy()
#     for integer in X:
#         if integer == "negative":  # Negative
#             integer = [1, 0, 0]
#         elif integer == "neutral":  # Neutral
#             integer = [0, 1, 0]
#         elif integer == "positive":  # Positive
#             integer = [0, 0, 1]
#         else:
#             break
#         sentiment_foofs.append(integer)
#     X = sentiment_foofs
#     return X


train_no_punc_lowercase = train.copy()
train_no_punc_lowercase['text'] = remove_case(train_no_punc_lowercase['text'])
train_no_punc_lowercase['text'] = remove_hyperlinks(train_no_punc_lowercase['text'])
train_no_punc_lowercase['text'] = remove_punctuation(train_no_punc_lowercase['text'])
train_no_punc_lowercase['sentiment'] = map_sentiment(train_no_punc_lowercase['sentiment'], 0, 1, 2)
train_no_punc_lowercase.head()

test_no_punc_lowercase = test.copy()
test_no_punc_lowercase['text'] = remove_case(test_no_punc_lowercase['text'])
test_no_punc_lowercase['text'] = remove_hyperlinks(test_no_punc_lowercase['text'])
test_no_punc_lowercase['text'] = remove_punctuation(test_no_punc_lowercase['text'])
test_no_punc_lowercase['sentiment'] = map_sentiment(test_no_punc_lowercase['sentiment'], 0, 1, 2)

features = train.columns.tolist()
features.remove('textID')  # all unique, high cardinality feature
features.remove('selected_text')  # target
target = 'selected_text'

X_train_no_punc_lowercase = train_no_punc_lowercase[features]
y_train_no_punc_lowercase = train_no_punc_lowercase[target]
X_test_no_punc_lowercase = test_no_punc_lowercase[features]


def stemming_column(df_column):
    ps = PorterStemmer()
    stemmed_word_list = []
    for i, string in enumerate(df_column):
        tokens = word_tokenize(string)
        new_string = ""
        for j, words in enumerate(tokens):
            new_string = new_string + ps.stem(words) + " "
        stemmed_word_list.append(new_string)
    return stemmed_word_list


def create_lookup_table(list1, list2):
    main_list = []
    lookup_dict = {}
    i = 1  # used to create a value in the dictionary
    main_list.append(list1)
    main_list.append(list2)
    for list in main_list:
        for string in list:
            for word in string.split():
                if word not in lookup_dict:
                    lookup_dict[word] = i
                    i += 1
    return lookup_dict


def encode(input_list, input_dict):
    encoded_list = []
    for string in input_list:
        sentence_list = []
        for word in string.split():
            sentence_list.append(input_dict[word])  # value lookup from dictionary.. int
        encoded_list.append(sentence_list)
    return encoded_list


def pad_data(list_of_lists):
    padded_data = tf.keras.preprocessing.sequence.pad_sequences(list_of_lists, padding='post')
    return padded_data


def create_array_sentiment_integers(list):
    sent_int_list = []
    for sentiment in list:
        sent_int_list.append(sentiment)
    return np.asarray(sent_int_list, dtype=np.int32)


X_train_stemmed_list = stemming_column(X_train_no_punc_lowercase['text'])
X_test_stemmed_list = stemming_column(X_test_no_punc_lowercase['text'])
lookup_table = create_lookup_table(X_train_stemmed_list, X_test_stemmed_list)

X_train_encoded_list = encode(X_train_stemmed_list, lookup_table)
X_train_padded_data = pad_data(X_train_encoded_list)

Y_train = create_array_sentiment_integers(train_no_punc_lowercase['sentiment'])
max_features = 3  # 3 choices 0, 1, 2

Y_train_final = np.zeros((Y_train.shape[0], max_features), dtype=np.float32)
Y_train_final[np.arange(Y_train.shape[0]), Y_train] = 1.0

input_dimension = len(lookup_table) + 1
output_dimension = 64
input_length = 33

model = Sequential()
model.add(tf.keras.layers.Embedding(input_dim=input_dimension,
                                    output_dim=output_dimension,
                                    input_length=input_length,
                                    mask_zero=True))

model.add(tf.keras.layers.LSTM(512, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(tf.keras.layers.Dense(256, activation='sigmoid'))

model.add(tf.keras.layers.Dropout(0.2))
model.add(Dense(3, activation='softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train_padded_data, Y_train_final, validation_split=0.20, epochs=10)

model.save('Tweet_sentiment.model')

Additionally, here are the shapes of the datasets:

x train shape: (27481, 33, 1)
x train type:  <class 'numpy.ndarray'>
y train shape: (27481, 3)

Error output

Epoch 1/3
363/859 [===========>..................] - ETA: 9s - loss: 0.5449 - accuracy: 0.5674
---------------------------------------------------------------------------
UnknownError                              Traceback (most recent call last)
<ipython-input-103-1d4af3962607> in <module>()
----> 1 model.fit(X_train_padded_data, Y_train_final, epochs=3,)

8 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     58     ctx.ensure_initialized()
     59     tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60                                         inputs, attrs, num_outputs)
     61   except core._NotOkStatusException as e:
     62     if name is not None:

UnknownError:  [_Derived_]  CUDNN_STATUS_BAD_PARAM
in tensorflow/stream_executor/cuda/cuda_dnn.cc(1496): 'cudnnSetRNNDataDescriptor( data_desc.get(), data_type, layout, max_seq_length, batch_size, data_size, seq_lengths_array, (void*)&padding_fill)'
     [[{{node cond_38/then/_0/CudnnRNNV3}}]]
     [[sequential_5/lstm_4/StatefulPartitionedCall]] [Op:__inference_train_function_36098]

Function call stack:
train_function -> train_function -> train_function

I see some problems in your code. They are mentioned below:

  • You are using input_dimension = len(lookup_table) + 1. len(lookup_table) is your vocabulary size, and its value will be very high, likely more than 30,000. It is recommended to use only a subset of those words, so you can set input_dimension = 10000 or input_dimension = 15000 (you may experiment with this value); that should solve the problem, and it should not noticeably hurt the model's accuracy.

  • Why does setting recurrent_dropout to a float value work? ==> When recurrent_dropout is set to a non-zero value, Keras falls back from the fused cuDNN kernel to the generic LSTM implementation, and that implementation does not hit this crash.

  • You should use return_sequences=True only if another LSTM layer follows the current one. Since you have only one LSTM layer, return_sequences should be set to False.
  • Since you have 3 classes, you shouldn't use binary_crossentropy. Use sparse_categorical_crossentropy if you are not one-hot encoding your target, or categorical_crossentropy if you are.
  • Are you sure you want to use masking in the Embedding layer? (A minimal corrected sketch covering these points is shown right after this list.)
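Putting those points together, a minimal sketch of a corrected model could look like the code below. It keeps your layer sizes; input_dimension = 10000 is only an assumption based on the first point, and it presumes the token ids are re-mapped to stay below that limit (for example with Tokenizer(num_words=10000), as in the tutorial further down):

import tensorflow as tf

input_dimension = 10000   # capped vocabulary size (assumption; token ids must be re-mapped below this limit)
output_dimension = 64
input_length = 33

model = tf.keras.Sequential([
    # mask_zero left at its default (False) so the fused cuDNN LSTM kernel stays usable
    tf.keras.layers.Embedding(input_dim=input_dimension,
                              output_dim=output_dimension,
                              input_length=input_length),
    # only one LSTM layer, so return_sequences stays False (the default); no recurrent_dropout
    tf.keras.layers.LSTM(512, dropout=0.2),
    tf.keras.layers.Dense(256, activation='sigmoid'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(3, activation='softmax')
])

# Y_train_final is one-hot encoded, so categorical_crossentropy is the matching loss;
# with integer labels you would use sparse_categorical_crossentropy instead
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# X_train_padded_data and Y_train_final as defined in your code above
model.fit(X_train_padded_data, Y_train_final, validation_split=0.20, epochs=10)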

Also, I see that you are using many functions and many lines of code for data preprocessing, like removing hyperlinks, removing punctuation, tokenizing, etc.

So I thought I would provide an end-to-end tutorial for text classification, which should help you as well as the Stack Overflow community. The code is shown below:

#!pip install tensorflow==2.1
#!pip install nltk
#!pip install tika
#!pip install textblob
#!pip3 install --upgrade numpy
#!pip install scikit-learn

# To handle Paths
import os

# To remove Hyperlinks and Dates
import re

# To remove Punctuations
import string

# This helps to remove the unnecessary words from our Text Data
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# To Parse the Input Data Files
from tika import parser

from textblob import TextBlob

# In order to use the Libraries of Tensorflow
import tensorflow as tf

# For Preprocessing the Text => To Tokenize the Text
from tensorflow.keras.preprocessing.text import Tokenizer
# If the Two Articles are of different length, pad_sequences will make the length equal
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Package for performing Numerical Operations
import numpy as np

# MatplotLib for Plotting Graphs
import matplotlib.pyplot as plt

# To shuffle the Data
from random import shuffle

# To Partition the Data into Train Data and Test Data
from sklearn.model_selection import train_test_split

# To add Regularizer in order to reduce Overfitting
from tensorflow.keras.regularizers import l2

# Give the Path of our Data
Path_Of_Data = 'Data'

# Extract the Labels from the Folders inside the Path mentioned above
Unique_Labels_List = ['negative', 'neutral', 'positive']

def GetNumericLabel(EachLabel):
    if EachLabel=='negative':
        return 0
    elif EachLabel=='neutral':
        return 1
    elif EachLabel=='positive':
        return 2

def Pre_Process_Data_And_Create_BOW(folder_path):
  #creating empty lists in order to Create Resume Text and the respective Label
  Resumes_List = []
  Labels_List = []
  for EachLabel in Unique_Labels_List:      
      for root, dirs, files in os.walk(os.path.join(folder_path, EachLabel),topdown=False):
        for file in files:
          i = 0
          if file.endswith('.pdf'):
            #Access individual file
            Full_Resume_Path = os.path.join(root, file)
            # Parse the Data inside the file
            file_data = parser.from_file(Full_Resume_Path)
            # Extract the Content of the File
            Resume_Text = file_data['content']

            # Below Code removes the Hyperlinks in the Resume, like LinkedIn Profile, Certifications, etc..
            HyperLink_Regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
            Text_Without_HL = re.sub(HyperLink_Regex, ' ', Resume_Text, flags=re.MULTILINE)

            # Below Code removes the Date from the Resume
            Date_regEx = r'(?:\d{1,2}[-/th|st|nd|rd\s]*)?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)?[a-z\s,.]*(?:\d{1,2}[-/th|st|nd|rd)\s,]*)+(?:\d{2,4})+'
            CleanedText = re.sub(Date_regEx,' ',Text_Without_HL)

            List_Of_All_Punctuations = list(string.punctuation)
            Important_Punctuations = ['#', '.', '+' , '-'] #Add more, if any other Punctuation is observed as Important

            NewLineChar = '\n'

            # Keep the "important" punctuation; everything else (plus the newline character)
            # will be stripped from the resume text
            List_Of_All_Punctuations = [EachPunct for EachPunct in List_Of_All_Punctuations
                                        if EachPunct not in Important_Punctuations]

            List_Of_All_Punctuations.append(NewLineChar)

            for EachPunct in List_Of_All_Punctuations:
                CleanedText = CleanedText.replace(EachPunct, " ")

            # Below code converts all the words in the resume to lowercase
            # (TODO: check whether this should happen after tokenization instead)
            Final_Cleaned_Resume_Text = CleanedText.lower()

            # Code to remove stopwords from each resume
            # (match whole, space-delimited words and keep accumulating on Resume_Text;
            # otherwise only the last stopword would actually be removed)
            Resume_Text = Final_Cleaned_Resume_Text
            for word in STOPWORDS:
                stop_token = ' ' + word + ' '
                Resume_Text = Resume_Text.replace(stop_token, ' ')
            Resumes_List.append(Resume_Text)
            Numeric_Label = GetNumericLabel(EachLabel)
            Labels_List.append(Numeric_Label)
      #print('Successfully executed for the Folder, ', EachLabel)
  #Return Final Lists
  return Resumes_List, Labels_List

#calling the function and passing the path
Resumes_List,  Labels_List = Pre_Process_Data_And_Create_BOW(Path_Of_Data)

vocab_size = 10000 # This is very important for you
# We want the Output of the Embedding Layer to be 64
embedding_dim = 64
max_length = 800
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
# Taking 80% of the Data as Training Data and remaining 20% will be for Test Data
training_portion = .8

# Size of the train data is 80% of the entire dataset (0.8 * len(Resumes_List))

Train_Resume_Size = int(len(Resumes_List) * training_portion)

Labels_List = np.asarray(Labels_List)

Train_Resume_Data, Validation_Resume_Data, Train_Labels, Validation_Labels = \
                    train_test_split(Resumes_List, Labels_List, train_size = training_portion, 
                                     shuffle = True
                                     , stratify= Labels_List)

from statistics import mean

print('Average number of words in each training resume is {}'.format(mean([len(i.split()) for i in Train_Resume_Data])))

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(Train_Resume_Data)
word_index = tokenizer.word_index

# Convert the Word Tokens into Integer equivalents, before passing it to keras embedding layer
train_sequences = tokenizer.texts_to_sequences(Train_Resume_Data)

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

validation_sequences = tokenizer.texts_to_sequences(Validation_Resume_Data)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(validation_sequences))
print(validation_padded.shape)

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

# Check your Data
def decode_article(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_article(train_padded[10]))
print('-------------------------------------------------------------------------')
print(Train_Resume_Data[10])

Regularizer = l2(0.001)

model = tf.keras.Sequential([
    # Add an Embedding layer expecting an input vocab of size vocab_size (10000), with the output embedding dimension of 64 we set at the top
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              embeddings_regularizer = Regularizer),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # Use ReLU in place of tanh, since they are good alternatives to each other.
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    # Add a Dense layer with 3 units and softmax activation.
    # With multiple output classes, softmax converts the output layer into a probability distribution.
    tf.keras.layers.Dense(3, activation='softmax')
])
model.summary()

#Using Early Stopping in order to handle Overfitting
ES_Callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])

num_epochs = 100

history = model.fit(x = train_padded, y = Train_Labels, epochs=num_epochs, 
                    callbacks=[ES_Callback],
                    validation_data=(validation_padded, Validation_Labels),
                    batch_size = 32, shuffle=True, verbose=1)

def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

version = 1
MODEL_DIR = 'Resume_Classification_Model'
export_path = os.path.join(MODEL_DIR, str(version))

tf.keras.models.save_model(model = model, filepath = export_path)

!ls -l {export_path}

!saved_model_cli show --dir {export_path} --all

For more information, please refer to this beautiful article.

Hope this solves your issue. Happy Learning!
