
How to do sentiment analysis with topic modeling or NER [Python]?

I have the following code for sentiment analysis. I would like to know how to include topic modeling or NER in it. (The dataset is customer reviews of 3 websites: a csv file with 2 columns, one for the review text and one for the rating, 0 for negative and 1 for positive.)

    import re
    import pandas as pd
    from nltk.corpus import stopwords  # requires nltk.download('stopwords')
    from nltk.stem.porter import PorterStemmer

    dataset = pd.read_csv('full_db.csv')
    X = dataset.iloc[:, 0].values
    y = dataset.iloc[:, 1].values

    # build the stemmer and the stopword list once, outside the loop
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    # keep negation words, since they carry sentiment
    no_stopwords = ["not", "don't", 'aren', 'don', 'ain', "aren't", 'couldn', "couldn't", "wasn't"]
    for no_stopword in no_stopwords:
        all_stopwords.remove(no_stopword)
    all_stopwords = set(all_stopwords)

    corpus = []
    for i in range(0, len(X)):
        review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])  # replace punctuation with spaces
        review = review.lower()  # transform all letters to lower case
        review = review.split()  # split the review into words
        # apply stemming and drop stopwords
        review = [ps.stem(word) for word in review if word not in all_stopwords]
        review = ' '.join(review)
        corpus.append(review)

    # vectorize the cleaned corpus: the classifier needs numeric features,
    # not raw strings, so fit a bag-of-words model on the corpus
    from sklearn.feature_extraction.text import CountVectorizer
    cv = CountVectorizer(max_features=1500)
    X_features = cv.fit_transform(corpus).toarray()

    # Splitting the dataset into Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

    # Logistic regression
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
    logistic = LogisticRegression(random_state=42, solver='lbfgs')  # binary target
    # Train the model and evaluate on the test set
    logistic = logistic.fit(X_train, y_train)
    y_pred = logistic.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

I don't know how to do NER, topic modeling, and sentiment analysis in one model, but I can show you here how to do NER and topic modeling with two separate models. I have been working in NLP for a while and wrote the two articles from which the code in this answer is taken.

You can use the NLTK package to do NER on your corpus without training an additional model. (Code taken from this article on named entity recognition.)

Run these in a Python interpreter to download the required packages:

>>> import nltk
>>> nltk.download("punkt")
>>> nltk.download("averaged_perceptron_tagger")
>>> nltk.download("maxent_ne_chunker")
>>> nltk.download("words")

Python:

import nltk
 
text = "Molly Moon is a cow. She is part of the United Nations' Climate Action Committee."
 
tokenized = nltk.word_tokenize(text)  # split the text into tokens
pos_tagged = nltk.pos_tag(tokenized)  # tag each token with its part of speech
chunks = nltk.ne_chunk(pos_tagged)    # group tagged tokens into named-entity chunks
for chunk in chunks:
    if hasattr(chunk, 'label'):       # only named-entity subtrees have a label
        print(chunk)
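
To apply this to your own data, here is a minimal sketch (my addition, not part of the quoted article) that runs the same tokenize, POS-tag, chunk pipeline over the raw reviews and collects the named entities found in each one. It assumes X is the array of raw review strings from your code; NER should run on the original text, since the stemming and lower-casing steps destroy the capitalization it relies on:

import nltk

# Sketch: extract (entity label, entity text) pairs from one review.
# `X` is assumed to be the raw review strings from the question's code.
def extract_entities(review_text):
    tokenized = nltk.word_tokenize(review_text)
    pos_tagged = nltk.pos_tag(tokenized)
    chunks = nltk.ne_chunk(pos_tagged)
    return [(chunk.label(), ' '.join(word for word, tag in chunk.leaves()))
            for chunk in chunks if hasattr(chunk, 'label')]

# One list of entities per review; counting these per entity type would give
# extra features you could append to the sentiment model's feature matrix.
review_entities = [extract_entities(review) for review in X]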

For topic modeling, you can use BERTopic. (Code taken from this article on topic modeling.) The docs variable should be a list of document texts.

# creating and fitting model
from bertopic import BERTopic
model = BERTopic()
topics, probs = model.fit_transform(docs)
# plotting model
import numpy as np
import pandas as pd
from umap import UMAP
 
import matplotlib
import matplotlib.pyplot as plt
 
# %matplotlib inline is a Jupyter magic; omit it when running as a plain script
%matplotlib inline
 
# Prepare data for plotting
embeddings = model._extract_embeddings(docs, method="document")
umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings)
df = pd.DataFrame(umap_model.embedding_, columns=["x", "y"])
df["topic"] = topics
 
# Plot parameters
top_n = 10
fontsize = 12
 
# Slice data
to_plot = df.copy()
to_plot.loc[df.topic >= top_n, "topic"] = -1  # lump everything outside the top_n topics in with the outliers
outliers = to_plot.loc[to_plot.topic == -1]
non_outliers = to_plot.loc[to_plot.topic != -1]
 
# Visualize topics
cmap = matplotlib.colors.ListedColormap(['#FF5722', # Red
                                        '#03A9F4', # Blue
                                        '#4CAF50', # Green
                                        '#80CBC4', # Teal
                                        '#673AB7', # Purple
                                        '#795548', # Brown
                                        '#E91E63', # Pink
                                        '#212121', # Black
                                        '#00BCD4', # Light Blue
                                        '#CDDC39', # Lime
                                        '#AED581', # Light Green
                                        '#FFE082', # Light Orange
                                        '#BCAAA4', # Light Brown
                                        '#B39DDB', # Light Purple
                                        '#F48FB1', # Light Pink
                                        ])
 
# Visualize outliers + inliers
fig, ax = plt.subplots(figsize=(15, 15))
scatter_outliers = ax.scatter(outliers['x'], outliers['y'], c="#E0E0E0", s=1, alpha=.3)
scatter = ax.scatter(non_outliers['x'], non_outliers['y'], c=non_outliers['topic'], s=1, alpha=.3, cmap=cmap)
 
# Add topic names to clusters
centroids = to_plot.groupby("topic").mean().reset_index().iloc[1:]  # skip the -1 outlier row
for _, row in centroids.iterrows():
    topic = int(row.topic)
    text = f"{topic}: " + "_".join([word for word, _ in model.get_topic(topic)[:3]])
    ax.text(row.x, row.y * 1.01, text, fontsize=fontsize, horizontalalignment='center')
 
ax.text(0.99, 0.01, f"BERTopic - Top {top_n} topics", transform=ax.transAxes, horizontalalignment="right", color="black")
plt.xticks([], [])
plt.yticks([], [])
plt.savefig("BERTopic_Example_Cluster_Plot.png")
plt.show()
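
Finally, the article doesn't cover wiring the topics back into your sentiment classifier, but one straightforward option is to use each review's topic assignment as an extra feature next to the bag-of-words matrix. A minimal sketch, assuming X, X_features, and y come from your code above:

import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from bertopic import BERTopic

# Fit BERTopic on the raw reviews (it works best on unstemmed text)
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(list(X))

# One-hot encode the topic id of each review (-1 is BERTopic's outlier topic)
encoder = OneHotEncoder(handle_unknown='ignore')
topic_features = encoder.fit_transform(np.array(topics).reshape(-1, 1))

# Stack the topic indicators next to the bag-of-words features
X_combined = hstack([csr_matrix(X_features), topic_features]).tocsr()

X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42)
logistic = LogisticRegression(random_state=42, solver='lbfgs', max_iter=1000)
logistic.fit(X_train, y_train)
print(accuracy_score(y_test, logistic.predict(X_test)))

Whether the topic features actually help is an empirical question; comparing the accuracy with and without them is a quick way to check.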
