I'm trying to build a utility in which a dataset is processed by an NMF model every couple of days. For the first run, I provide a starting value for the number of topics. How can I calculate the coherence score for the entire dataset? I'm planning to use the calculated score to rebuild the model so that it becomes more accurate. Below is the code that I've used.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
import clr
#PLOTTING TOOLS
# import matplotlib.pyplot as PLOTTING
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
# Load the line-delimited JSON file and take the 'attachment' column as the
# collection of documents to model.
dataset = pd.read_json('out.json', lines = True)
documents = dataset['attachment']
# Vocabulary cap passed to the vectorizer, and the initial topic count.
no_features = 1000
no_topics = 9
# print ('Old number of topics: ', no_topics)
# Build the TF-IDF document-term matrix.
tfidf_vectorizer = TfidfVectorizer(max_df = 0.95, min_df = 2, max_features = no_features, stop_words = 'english', norm='l2')
tfidf = tfidf_vectorizer.fit_transform(documents)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() - confirm against the installed version.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# no_topics is rebound here from the integer 9 to the matrix shape tuple.
no_topics = tfidf.shape
# NOTE(review): shape[0] is presumably the number of documents, so the model
# below is asked for one topic per document - confirm this is intended.
retrain_value = no_topics[0]
# print('New number of topics :', retrain_value)
# NOTE(review): newer scikit-learn releases replaced NMF's `alpha` with
# `alpha_W` / `alpha_H` - confirm against the installed version.
nmf = NMF(n_components = retrain_value, random_state = 1, alpha = .1, l1_ratio = .5, init = 'nndsvd').fit(tfidf)
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Fitted decomposition model exposing ``components_``; each row
            is treated as the term-weight vector of one topic.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Number of top terms to print per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d: " % (topic_idx))
        # argsort() is ascending; the negative-step slice walks it from the
        # end, yielding the indices of the largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_indices]))
# Print the 20 highest-weighted terms of each topic in the fitted NMF model.
no_top_words = 20
display_topics(nmf, tfidf_feature_names, no_top_words)
Unfortunately, there is no out-of-the-box coherence model for sklearn.decomposition.NMF.
I've had the very same issue and found a custom implementation that works with Python 3.8.
It should be easy to adapt to your code. Please check the link for the full imports, etc.
A snippet from my recent usage of this technique:
# Fit one NMF model per candidate topic count and keep its factors so that
# each can be scored for coherence afterwards.
# NOTE(review): `decomposition` (presumably sklearn.decomposition) and the
# document-term matrix `A` are not defined in this snippet - they must come
# from the surrounding code.
kmin, kmax = 2, 30
topic_models = []
# try each value of k
for k in range(kmin, kmax + 1):
    print("Applying NMF for k=%d ..." % k)
    # run NMF
    model = decomposition.NMF(init="nndsvd", n_components=k)
    W = model.fit_transform(A)
    H = model.components_
    # store for later
    topic_models.append((k, W, H))
class TokenGenerator:
    """Re-iterable stream of token lists, one list per document.

    Each document is split with a regular expression that matches words of
    at least two word characters. Tokens are lower-cased, and any token found
    in the stopword collection is replaced by the placeholder ``"<stopword>"``
    rather than dropped.
    """

    def __init__(self, documents, stopwords):
        """Store the inputs and compile the tokenizer pattern.

        Args:
            documents: Iterable of document strings. It is iterated afresh
                on every pass over this object.
            stopwords: Collection of lower-case stopwords, used only for
                membership tests.
        """
        self.documents = documents
        self.stopwords = stopwords
        self.tokenizer = re.compile(r"(?u)\b\w\w+\b")

    def __iter__(self):
        """Yield the list of processed tokens for each document in turn."""
        print("Building Word2Vec model ...")
        for doc in self.documents:
            tokens = []
            for tok in self.tokenizer.findall(doc):
                lowered = tok.lower()
                if lowered in self.stopwords:
                    tokens.append("<stopword>")
                elif len(tok) >= 2:
                    tokens.append(lowered)
            yield tokens
# Wrap the raw documents in a token stream and train a Word2Vec model on it.
# NOTE(review): `docs_raw`, `stop_words` and `gensim` are not defined in this
# snippet - they must come from the surrounding code.
docgen = TokenGenerator(docs_raw, stop_words)
# NOTE(review): gensim 4.0 renamed the `size` keyword to `vector_size` -
# confirm against the installed version.
w2v_model = gensim.models.Word2Vec(docgen, size=500, min_count=20, sg=1)
def calculate_coherence(w2v_model, term_rankings):
    """Return the mean pairwise-similarity coherence across all topics.

    A topic's score is the mean of ``similarity(a, b)`` over every unordered
    pair of its terms. The result is the mean of the topic scores.

    Args:
        w2v_model: Object providing ``similarity(term_a, term_b)``, either
            directly or through a ``wv`` attribute (so both a trained
            Word2Vec model and its keyed vectors are accepted).
        term_rankings: Sequence of topics, each a sequence of terms.

    Returns:
        The mean coherence. A topic with fewer than two terms has no pairs
        and contributes 0.0; an empty ``term_rankings`` yields 0.0.
    """
    if not term_rankings:
        return 0.0
    # Prefer the keyed vectors when present: newer gensim releases expose
    # similarity() only there, not on the model itself.
    vectors = getattr(w2v_model, "wv", w2v_model)
    overall_coherence = 0.0
    for topic_terms in term_rankings:
        # check each pair of terms
        pair_scores = [
            vectors.similarity(first, second)
            for first, second in combinations(topic_terms, 2)
        ]
        # get the mean for all pairs in this topic (no pairs -> adds nothing)
        if pair_scores:
            overall_coherence += sum(pair_scores) / len(pair_scores)
    # get the mean score across all topics
    return overall_coherence / len(term_rankings)
def get_descriptor(all_terms, H, topic_index, top):
    """Return the ``top`` highest-weighted terms of one topic.

    Args:
        all_terms: Sequence mapping a column index of ``H`` to its term.
        H: Two-dimensional topic-term weight array, one row per topic.
        topic_index: Row of ``H`` to describe.
        top: Maximum number of terms to return.

    Returns:
        List of terms ordered from highest to lowest weight.
    """
    # reverse sort the values to sort the indices
    top_indices = np.argsort(H[topic_index, :])[::-1]
    # now get the terms corresponding to the top-ranked indices
    return [all_terms[term_index] for term_index in top_indices[0:top]]
# Score every fitted model: build each topic's top-10 term list from its H
# factor, then record the model's mean coherence next to its k.
k_values = []
coherences = []
for (k, W, H) in topic_models:
    # Get all of the topic descriptors - the term_rankings, based on top 10 terms
    term_rankings = [
        get_descriptor(terms, H, topic_index, 10)
        for topic_index in range(k)
    ]
    # Now calculate the coherence based on our Word2vec model
    k_values.append(k)
    coherences.append(calculate_coherence(w2v_model, term_rankings))
    print("K=%02d: Coherence=%.4f" % (k, coherences[-1]))
# Plot mean coherence against the number of topics and label the best k.
# NOTE(review): the next line is an IPython/Jupyter magic, not Python syntax;
# it only works inside a notebook.
%matplotlib inline
plt.style.use("ggplot")
# NOTE(review): this needs the name `matplotlib` to be bound (e.g. via
# `import matplotlib`); only `matplotlib.pyplot as plt` is imported in view.
matplotlib.rcParams.update({"font.size": 14})
fig = plt.figure(figsize=(13,7))
# create the line plot
ax = plt.plot( k_values, coherences )
plt.xticks(k_values)
plt.xlabel("Number of Topics")
plt.ylabel("Mean Coherence")
# add the points
plt.scatter( k_values, coherences, s=120)
# find and annotate the maximum point on the plot
ymax = max(coherences)
# index() returns the first occurrence, so ties resolve to the smallest k
xpos = coherences.index(ymax)
best_k = k_values[xpos]
plt.annotate( "k=%d" % best_k, xy=(best_k, ymax), xytext=(best_k, ymax), textcoords="offset points", fontsize=16)
# show the plot
plt.show()
Results:
K=02: Coherence=0.4157
K=03: Coherence=0.4399
K=04: Coherence=0.4626
K=05: Coherence=0.4333
K=06: Coherence=0.4075
K=07: Coherence=0.4121
...
The technical post webpages of this site are licensed under CC BY-SA 4.0. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.