[英]Is this the right input for a Neural Network?
我正在嘗試構建一個算法來預測文本是否厭惡女性。 我使用的數據來自 .csv 文件,並具有以下標簽:id、text、label(0 不是厭惡女性,1 是厭惡女性)。 該模型正在運行,因為我使用預制字典在預制數據集上進行了測試,因此問題一定是我處理數據的方式。 我嘗試使用詞袋方法制作我自己的字典,然后塑造我的數據,使其適合模型,但它在第一個時期停止。 完整代碼如下:
import random
import sklearn
import tensorflow as td
from tensorflow import keras
from sklearn import svm
from sklearn import metrics
import pandas as pd
import numpy as np
import string
import nltk
from collections import Counter
def get_corpus_vocabulary(corpus):
counter = Counter()
for text in corpus:
tokens=tokenize(text)
counter.update(tokens)
return counter
def tokenize(text):
return nltk.WordPunctTokenizer().tokenize(text)
def get_representation(vocabulary, how_many):
most_comm = vocabulary.most_common(how_many)
wd2idx = {}
idx2wd = {}
for position,word in enumerate(most_comm):
word=words[0]
wd2idx[word] = position
idx2wd[position] = word
return wd2idx, idx2wd
def shape_data(data):
#encoded_data = np.array()
data_clean = data.copy()
for line in range(len(data)):
transtable = str.maketrans('', '', string.punctuation)
data_clean[line] = data[line].translate(transtable).strip().split(" ")
encoded_line = encode(data_clean[line])
encoded_line = keras.preprocessing.sequence.pad_sequences([encoded_line], value=word_index["<PAD>"], padding="post", maxlen=35)
if line == 0:
encoded_data = np.array(encoded_line)
else:
encoded_data = np.append(encoded_data, encoded_line, axis = 0)
return encoded_data
def encode(word_list):
encoded = [1]
for word in word_list:
if word in word_index:
encoded.append(word_index[word])
else:
encoded.append(2)
return encoded
train_data=pd.read_csv('train.csv')
corpus=train_data['text']
train_labels = train_data['label'].values
all_words=get_corpus_vocabulary(corpus)
word_index, index_word = get_representation(all_words,100000)
#v+3 for 1,2,3; 0 never had a key
word_index = {k:(v+3) for k,v in word_index.items()} #dictionary
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3
encoded_train_data = shape_data(corpus)
#model
model = keras.Sequential()
model.add(keras.layers.Embedding(100, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
x_val = encoded_train_data[:1000]
x_train = encoded_train_data[1000:]
y_val = train_labels[:1000]
y_train = train_labels[1000:]
fitModel = model.fit(x_train, y_train, epochs=40, batch_size=512, validation_data=(x_val, y_val), verbose=1)
results = model.evaluate(test_data, test_labels)
print(results)
我還要提一下,x_train、y_train、x_val 和 y_val 都是 numpy ndarrays。
錯誤是:
Traceback (most recent call last):
File "C:/programare/python/ProiectML/neuronalNetwork.py", line 99, in <module>
fitModel = model.fit(x_train, y_train, epochs=40, batch_size=512, validation_data=(x_val, y_val), verbose=1)
File "C:\Users\User\.conda\envs\tensor\lib\site-packages\tensorflow\python\keras\engine\training.py", line 108, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\User\.conda\envs\tensor\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1098, in fit
tmp_logs = train_function(iterator)
File "C:\Users\User\.conda\envs\tensor\lib\site-packages\tensorflow\python\eager\def_function.py", line 780, in __call__
result = self._call(*args, **kwds)
File "C:\Users\User\.conda\envs\tensor\lib\site-packages\tensorflow\python\eager\def_function.py", line 840, in _call
return self._stateless_fn(*args, **kwds)
File "C:\Users\User\.conda\envs\tensor\lib\site-packages\tensorflow\python\eager\function.py", line 2829, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "C:\Users\User\.conda\envs\tensor\lib\site-packages\tensorflow\python\eager\function.py", line 1848, in _filtered_call
cancellation_manager=cancellation_manager)
File "C:\Users\User\.conda\envs\tensor\lib\site-packages\tensorflow\python\eager\function.py", line 1924, in _call_flat
ctx, args, cancellation_manager=cancellation_manager))
File "C:\Users\User\.conda\envs\tensor\lib\site-packages\tensorflow\python\eager\function.py", line 550, in call
ctx=ctx)
File "C:\Users\User\.conda\envs\tensor\lib\site-packages\tensorflow\python\eager\execute.py", line 60, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.InvalidArgumentError: indices[448,1] = 453 is not in [0, 100)
[[node sequential/embedding/embedding_lookup (defined at /programare/python/ProiectML/neuronalNetwork.py:99) ]] [Op:__inference_train_function_850]
Errors may have originated from an input operation.
Input Source operations connected to node sequential/embedding/embedding_lookup:
sequential/embedding/embedding_lookup/575 (defined at \Users\User\.conda\envs\tensor\lib\contextlib.py:112)
Function call stack:
train_function
改變這一行:
model.add(keras.layers.Embedding(len(word_index), 16))
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.