[英]How to do sentiment analysis with topic modeling or NER [ Python]?
我有以下用于情绪分析的代码。 我想知道如何在其中包含主题建模或 NER? (数据集是客户对 3 个网站的评论,一个 csv 文件,有 2 列,一个是评论,一个是评分,0 为负面,1 为正面)
# Sentiment-analysis script: clean the review text, turn it into bag-of-words
# features, and train a logistic-regression classifier on the 0/1 ratings.
import re

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('full_db.csv')
X = dataset.iloc[:, 0].values
y = dataset.iloc[:, 1].values

# Loop invariants: build the stemmer and the stop-word set once, not per review.
ps = PorterStemmer()
# Negation words carry sentiment, so they are kept out of the stop-word filter.
no_stopwords = ["not", "don't", 'aren', 'don', 'ain', "aren't", 'couldn', "couldn't", "wasn't"]
# Set difference does not raise when a word is absent from the stop-word list
# (list.remove would), and a set gives O(1) membership tests below.
all_stopwords = set(stopwords.words('english')) - set(no_stopwords)

corpus = []
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # replace non-letters with spaces
    review = review.lower()  # lower-case everything
    words = review.split()  # split the review into words
    # apply stemming to every word that is not a stop word
    corpus.append(' '.join(ps.stem(word) for word in words if word not in all_stopwords))

# Splitting the cleaned corpus into Training set and Test set
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.2, random_state=42)

# LogisticRegression needs numeric features, so the cleaned text is converted
# to token counts. The vocabulary is learned from the training split only so
# that nothing from the test split leaks into the model.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)

# logistic regression
# Initialize a logistic regression model
# NOTE(review): `multi_class` is reportedly deprecated in recent scikit-learn
# releases - confirm against the installed version before upgrading.
logistic = LogisticRegression(random_state=42, solver='lbfgs',
                              multi_class='multinomial')
# Train the model
logistic = logistic.fit(X_train, y_train)
y_pred = logistic.predict(X_test)
我不知道如何在同一个模型中同时进行 NER、主题建模和情感分析,但我可以在这里向您展示如何使用两个单独的模型分别进行 NER 和主题建模。我从事 NLP 工作已有一段时间,本答案中的代码取自我写的两篇文章。
您可以使用 NLTK 包在您的语料库上进行 NER,而无需额外训练 model。(代码取自这篇关于命名实体识别的文章)
在 Python 解释器中运行以下命令,下载所需的 NLTK 数据包:
>>> import nltk
>>> nltk.download("punkt")
>>> nltk.download("averaged_perceptron_tagger")
>>> nltk.download("maxent_ne_chunker")
>>> nltk.download("words")
Python:
# Named-entity recognition with NLTK's pre-trained chunker: tokenize the text,
# tag parts of speech, chunk, then print every chunk that carries a label.
import nltk

text = "Molly Moon is a cow. She is part of the United Nations' Climate Action Committee."

# Tokenize -> POS-tag -> chunk, chained into a single expression.
chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))

# Only elements exposing a `label` attribute are printed; everything else in
# the chunker output is skipped.
labeled_chunks = (node for node in chunks if hasattr(node, 'label'))
for entity in labeled_chunks:
    print(entity)
对于主题建模,您可以使用 BERTopic。(代码摘自这篇关于主题建模的文章)
`docs` 变量应该是文档文本列表。
# Fit a BERTopic model on `docs` and draw a 2-D scatter plot of the documents,
# coloured by topic, with the top three words of each topic as a label.
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bertopic import BERTopic
from umap import UMAP

# creating and fitting model
# `docs` must already be defined by the caller; it is passed to fit_transform.
model = BERTopic()
topics, probs = model.fit_transform(docs)

# plotting model
# In a Jupyter notebook, run the `%matplotlib inline` magic in a cell first.
# It is an IPython magic, not Python syntax, so it is not part of this script.

# Prepare data for plotting
# NOTE(review): `_extract_embeddings` is a private BERTopic method - confirm it
# exists in the installed version.
embeddings = model._extract_embeddings(docs, method="document")
umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings)
df = pd.DataFrame(umap_model.embedding_, columns=["x", "y"])
df["topic"] = topics

# Plot parameters
top_n = 10
fontsize = 12

# Slice data
to_plot = df.copy()
# Demote topics beyond the first `top_n` to the outlier bucket (-1). Only the
# `topic` column is overwritten, so the points keep their real x/y position
# (assigning to the whole row would move them all to (-1, -1)).
to_plot.loc[to_plot.topic >= top_n, "topic"] = -1
outliers = to_plot.loc[to_plot.topic == -1]
non_outliers = to_plot.loc[to_plot.topic != -1]

# Visualize topics
cmap = matplotlib.colors.ListedColormap(['#FF5722',  # Red
                                         '#03A9F4',  # Blue
                                         '#4CAF50',  # Green
                                         '#80CBC4',  # Teal
                                         '#673AB7',  # Purple
                                         '#795548',  # Brown
                                         '#E91E63',  # Pink
                                         '#212121',  # Black
                                         '#00BCD4',  # Light Blue
                                         '#CDDC39',  # Lime
                                         '#AED581',  # Light Green
                                         '#FFE082',  # Light Orange
                                         '#BCAAA4',  # Light Brown
                                         '#B39DDB',  # Light Purple
                                         '#F48FB1',  # Light Pink
                                         ])

# Visualize outliers + inliers
fig, ax = plt.subplots(figsize=(15, 15))
scatter_outliers = ax.scatter(outliers['x'], outliers['y'], c="#E0E0E0", s=1, alpha=.3)
scatter = ax.scatter(non_outliers['x'], non_outliers['y'], c=non_outliers['topic'], s=1, alpha=.3, cmap=cmap)

# Add topic names to clusters
# Centroids are computed from the non-outlier rows only, so no real topic is
# dropped when the data happens to contain no -1 group.
centroids = non_outliers.groupby("topic")[["x", "y"]].mean().reset_index()
for row in centroids.itertuples(index=False):
    topic = int(row.topic)
    # Label = topic id plus the first element of each of its top three entries.
    label = f"{topic}: " + "_".join([entry[0] for entry in model.get_topic(topic)[:3]])
    ax.text(row.x, row.y*1.01, label, fontsize=fontsize, horizontalalignment='center')

ax.text(0.99, 0.01, f"BERTopic - Top {top_n} topics", transform=ax.transAxes, horizontalalignment="right", color="black")
plt.xticks([], [])
plt.yticks([], [])
plt.savefig("BERTopic_Example_Cluster_Plot.png")
plt.show()
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.