I am a beginner and I am trying to do sentiment classification on the IMDB dataset by first using a TF-IDF vectorizer and then using the TF-IDF vectors to train a neural network for binary classification. As pre-processing, I have removed stopwords.
I have tried the following approaches, but every time I end up with a new error. Please advise on the best way to code this problem: I want to use TF-IDF vectorization together with a neural network for binary sentiment classification of IMDB reviews.
I have written the following function to create the TF-IDF vectorizer:
def Ngram_Vectorizer(reviews_train, reviews_test):
    """Turn raw review texts into bigram TF-IDF feature matrices.

    The vectorizer is fitted on the training reviews only; the learned
    vocabulary and IDF weights are then applied to the test reviews.

    Args:
        reviews_train: Training reviews, as accepted by ``TfidfVectorizer``.
        reviews_test: Test reviews, as accepted by ``TfidfVectorizer``.

    Returns:
        A ``(train_matrix, test_matrix)`` tuple holding the TF-IDF
        transformed training and test reviews.
    """
    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2))
    # fit_transform is documented as equivalent to fit() followed by
    # transform() on the same data, so the training matrix is unchanged.
    train_matrix = tfidf.fit_transform(reviews_train)
    test_matrix = tfidf.transform(reviews_test)
    return train_matrix, test_matrix
After getting the TF-IDF vectors, I pass them to a shallow neural network as follows:
def NeuralNetwork(reviews_train, labels_train, reviews_test, labels_test):
    """Train a shallow binary classifier on TF-IDF features.

    Builds a 256-unit ReLU layer, a 20% dropout layer and a single sigmoid
    output unit, compiles the model with binary cross-entropy and Adam,
    trains it for 5 epochs with a batch size of 128 using the test split as
    validation data, and finally prints the model summary.

    Args:
        reviews_train: Training feature matrix (one row per review).
        labels_train: Labels for the training reviews.
        reviews_test: Feature matrix used as validation data.
        labels_test: Labels used as validation data.

    Returns:
        None.
    """
    # Keras' input_shape describes a single sample and excludes the batch
    # axis, so only the number of feature columns is passed.
    n_features = reviews_train.shape[1]
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, input_shape=(n_features,), activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # The inputs are handed to fit() unchanged; this is the call that fails
    # in the traceback that follows this snippet.
    model.fit(reviews_train, labels_train, validation_data = (reviews_test, labels_test), batch_size = 128, epochs = 5)
    model.summary()
When I do the above, I get the following error:
Traceback (most recent call last):
File "neuralnetwork_tfidf_classifier.py", line 139, in <module>
main()
File "neuralnetwork_tfidf_classifier.py", line 136, in main
NeuralNetwork(reviews_train, labels_train, reviews_test, labels_test)
File "neuralnetwork_tfidf_classifier.py", line 72, in NeuralNetwork
model.fit(reviews_train, labels_train, validation_data = (reviews_test, labels_test), batch_size = 128, epochs = 5)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 108, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1063, in fit
steps_per_execution=self._steps_per_execution)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 1117, in __init__
model=model)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 573, in __init__
dataset = dataset_ops.DatasetV2.from_tensor_slices(inputs)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 682, in from_tensor_slices
return TensorSliceDataset(tensors)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 3003, in __init__
self._tensors = structure.to_batched_tensor_list(batched_spec, element)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\data\util\structure.py", line 352, in to_batched_tensor_list
component), element_spec, element)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\data\util\structure.py", line 326, in _to_tensor_list_helper
reduce_fn, zip(nest.flatten(element_spec), nest.flatten(element)), [])
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\data\util\structure.py", line 323, in reduce_fn
return encode_fn(state, spec, component)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\data\util\structure.py", line 352, in <lambda>
component), element_spec, element)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\framework\sparse_tensor.py", line 346, in _to_batched_tensor_list
out_type=dtypes.variant)]
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\gen_sparse_ops.py", line 498, in serialize_many_sparse
_ops.raise_from_not_ok_status(e, name)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\framework\ops.py", line 6843, in raise_from_not_ok_status
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: indices[1] = [0,3105402] is out of order. Many sparse ops require sorted indices.
Use `tf.sparse.reorder` to create a correctly ordered copy.
[Op:SerializeManySparse]
Then, to solve the above error, I wrapped the inputs to the neural network in tf.sparse.reorder (e.g. tf.sparse.reorder(reviews_train), tf.sparse.reorder(labels_train)) as follows:
def NeuralNetwork(reviews_train, labels_train, reviews_test, labels_test):
    """Train a shallow binary classifier, reordering every input first.

    Builds a 256-unit ReLU layer, a 20% dropout layer and a single sigmoid
    output unit, compiles the model with binary cross-entropy and Adam, and
    trains it for 5 epochs with a batch size of 128. Each feature matrix and
    label array is passed through ``tf.sparse.reorder`` before being given
    to ``fit``. The model summary is printed after training.

    Args:
        reviews_train: Training feature matrix (one row per review).
        labels_train: Labels for the training reviews.
        reviews_test: Feature matrix used as validation data.
        labels_test: Labels used as validation data.

    Returns:
        None.
    """
    # Keras' input_shape describes a single sample and excludes the batch
    # axis, so only the number of feature columns is passed.
    n_features = reviews_train.shape[1]
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, input_shape=(n_features,), activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # tf.sparse.reorder rejects anything that is not a SparseTensor; this is
    # the call that raises the TypeError in the traceback that follows.
    model.fit(tf.sparse.reorder(reviews_train), tf.sparse.reorder(labels_train), validation_data = (tf.sparse.reorder(reviews_test), tf.sparse.reorder(labels_test)), batch_size = 128, epochs = 5)
    model.summary()
But I got a new error which is as follows:
Traceback (most recent call last):
File "neuralnetwork_tfidf_classifier.py", line 139, in <module>
main()
File "neuralnetwork_tfidf_classifier.py", line 136, in main
NeuralNetwork(reviews_train, labels_train, reviews_test, labels_test)
File "neuralnetwork_tfidf_classifier.py", line 72, in NeuralNetwork
model.fit(tf.sparse.reorder(reviews_train), tf.sparse.reorder(labels_train), validation_data = (tf.sparse.reorder(reviews_test), tf.sparse.reorder(labels_test)), batch_size = 128, epochs = 5)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\sparse_ops.py", line 823, in sparse_reorder
sp_input = _convert_to_sparse_tensor(sp_input)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\sparse_ops.py", line 71, in _convert_to_sparse_tensor
raise TypeError("Input must be a SparseTensor.")
TypeError: Input must be a SparseTensor.
Thirdly I tried using todense() on the TF-IDF output as follows:
def Ngram_Vectorizer(reviews_train, reviews_test):
    """Turn raw review texts into dense bigram TF-IDF matrices.

    The vectorizer is fitted on the training reviews only; the learned
    vocabulary and IDF weights are then applied to the test reviews. Both
    results are converted to dense matrices before being returned.

    Args:
        reviews_train: Training reviews, as accepted by ``TfidfVectorizer``.
        reviews_test: Test reviews, as accepted by ``TfidfVectorizer``.

    Returns:
        A ``(train_dense, test_dense)`` tuple holding the dense TF-IDF
        matrices for the training and test reviews.
    """
    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2))
    # todense() allocates the full (documents x vocabulary) array in memory;
    # fit_transform is documented as equivalent to fit() then transform().
    train_dense = tfidf.fit_transform(reviews_train).todense()
    test_dense = tfidf.transform(reviews_test).todense()
    return train_dense, test_dense
But again this threw the following error:
Traceback (most recent call last):
File "neuralnetwork_tfidf_classifier.py", line 141, in <module>
main()
File "neuralnetwork_tfidf_classifier.py", line 129, in main
reviews_train, reviews_test= Ngram_Vectorizer(reviews_train, reviews_test)
File "neuralnetwork_tfidf_classifier.py", line 50, in Ngram_Vectorizer
reviews_train = tfidf.transform(reviews_train).todense()
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\scipy\sparse\base.py", line 847, in todense
return asmatrix(self.toarray(order=order, out=out))
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\scipy\sparse\compressed.py", line 1025, in toarray
out = self._process_toarray_args(order, out)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\site-packages\scipy\sparse\base.py", line 1185, in _process_toarray_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
MemoryError: Unable to allocate 1.14 TiB for an array with shape (50010, 3140004) and data type float64
Any help or guidance towards the right direction will be very helpful. Thank you everyone in advance.
The last error means that the code itself would run, but your PC does not have enough memory to hold the dense array (this happens often, even on very large machines). Try running it on only a small part of the dataset to see whether it works.
Change the type to float instead of sparse. See below:
# Cast both TF-IDF matrices to float16.
# NOTE(review): astype presumably changes only the element dtype and leaves
# the matrices sparse; also confirm float16 is supported by the installed
# SciPy version.
reviews_train = tfidf.transform(reviews_train).astype('float16')
reviews_test = tfidf.transform(reviews_test).astype('float16')
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.