[英]Sentiment analysis with predefined text
我正在使用NLTK在Python中進行情感分析項目。項目的輸出必須顯示給定的語句是正面的還是負面的。我已經成功地做到了這一點,但是如何讓中性語句也得到對應的輸出呢?是否可以用百分比形式輸出(即正面、負面和中性各自的百分比)?
classifier.py
import random
import preprocess
import nltk
def get_classifier():
    """Train a Naive Bayes classifier on the preprocessed data and return it.

    The labelled feature sets returned by ``preprocess.get_data()`` are
    shuffled in place; the first 80% are used for training and the rest for
    measuring accuracy, which is printed after a short banner.
    """
    samples = preprocess.get_data()
    random.shuffle(samples)

    # 80/20 train/test split over the shuffled samples.
    cutoff = int(0.8 * len(samples))
    training, held_out = samples[:cutoff], samples[cutoff:]

    model = nltk.NaiveBayesClassifier.train(training)
    score = nltk.classify.util.accuracy(model, held_out)

    print("Generated Classifier")
    print('-' * 70)
    print("Accuracy: ", score)
    return model
preprocess.py
import nltk.classify
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# English stop words from NLTK's stopwords corpus, loaded once at import time;
# create_word_features_pos / create_word_features_neg filter their input
# against this list.
stop_words = stopwords.words("english")
def create_word_features_pos(words):
    """Build 'positive'-labelled single-word feature sets from *words*.

    Words present in the module-level ``stop_words`` are skipped. Every
    remaining word yields one ``({word: True}, 'positive')`` tuple, in the
    same order as the input.
    """
    labelled = []
    for token in words:
        if token in stop_words:
            continue
        labelled.append(({token: True}, 'positive'))
    return labelled
def create_word_features_neg(words):
    """Build 'negative'-labelled single-word feature sets from *words*.

    Words present in the module-level ``stop_words`` are skipped. Every
    remaining word yields one ``({word: True}, 'negative')`` tuple, in the
    same order as the input.
    """
    labelled = []
    for token in words:
        if token in stop_words:
            continue
        labelled.append(({token: True}, 'negative'))
    return labelled
def create_word_features(words):
    """Map the sentiment-lexicon words that occur in *words* to a polarity flag.

    Stop words are removed from *words* first. Every word of
    ``positive-words.txt`` that remains in the input maps to ``True`` and
    every word of ``negative-words.txt`` that remains maps to ``False``; a
    word listed in both lexicons ends up ``False`` because the negative
    entries are applied last.

    Args:
        words: iterable of (hashable) word tokens to extract features from.

    Returns:
        dict mapping each matched lexicon word to ``True`` (positive) or
        ``False`` (negative).
    """
    # Fetch the stop-word list once instead of once per input word.
    english_stops = set(stopwords.words("english"))
    # A set keeps the lexicon membership tests below O(1) each.
    useful_words = {word for word in words if word not in english_stops}

    pos_txt = get_tokenized_file(u"positive-words.txt")
    neg_txt = get_tokenized_file(u"negative-words.txt")

    features = {word: True for word in pos_txt if word in useful_words}
    # The original also executed ``my_dict3 = dict([word,])`` here; ``word``
    # is not defined outside the comprehensions in Python 3 (NameError) and
    # the result was never used, so that statement is dropped.
    features.update(
        {word: False for word in neg_txt if word in useful_words}
    )
    return features
def get_tokenized_file(file):
    """Read the text file at *file* and return its contents as word tokens.

    Args:
        file: path of the text file to read (opened in text mode with the
            platform default encoding, as before).

    Returns:
        The list of tokens produced by ``word_tokenize`` for the whole file.
    """
    # Context manager so the handle is closed even if reading or tokenizing
    # raises; the original left the file object open.
    with open(file, 'r') as handle:
        return word_tokenize(handle.read())
def get_data():
    """Load both sentiment lexicons and return their labelled feature sets.

    Prints a progress line before reading each lexicon. The returned list
    holds the positive samples followed by the negative samples.
    """
    print("Collecting Negative Words")
    negative_samples = create_word_features_neg(
        get_tokenized_file(u"negative-words.txt")
    )

    print("Collecting Positive Words")
    positive_samples = create_word_features_pos(
        get_tokenized_file(u"positive-words.txt")
    )

    return positive_samples + negative_samples
def process(data):
    """Tokenize *data* with ``word_tokenize`` and return the lower-cased tokens."""
    lowered = []
    for token in word_tokenize(data):
        lowered.append(token.lower())
    return lowered
nltk.NaiveBayesClassifier.train
的文檔:
參數:labeled_featuresets-分類特征集的列表,即元組列表(特征集,標簽)。
這意味着您的train_set
是一組(features, label)
的元組。
如果要添加 neutral 類型,則需要將某些數據標記為 neutral,否則分類器將無法學習該新類型。
現在,您將數據標記為: (word, True)
和(word, False)
,切換到3個標簽的示例是(word, 0)
, (word, 1)
, (word, 2)
nltk.NaiveBayesClassifier.prob_classify
將返回每個標簽的概率。
可以在這里找到文檔: https://www.nltk.org/api/nltk.classify.html#nltk.classify.naivebayes.NaiveBayesClassifier
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.