简体   繁体   中英

Assign Topic from NNMF Topic Modelling

I have a list of text comments that are fed into a non-negative matrix factorization topic modelling program.

import pandas as pd
import numpy as np
# load the data
import csv
# Read every data row of the comments CSV into a list of tuples.
with open('C:\\...\\comments.csv', newline='') as f:
    reader = csv.reader(f)
    # Skip the header; the default keeps an empty file from raising StopIteration.
    next(reader, None)
    df = [tuple(row) for row in reader]

# set the number of topics 
total_topics = 3

# process the data
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords

data_text = pd.DataFrame(df, columns=['text'])

# Extra words to drop on top of gensim's built-in stopword list.
custom_stops = ["stopword1", "stopword2", "stopword3"]


def _filter_comment(comment):
    """Lower-case a comment, strip gensim stopwords, and return its remaining tokens."""
    tokens = remove_stopwords(comment.lower()).split()
    return [token for token in tokens if token.lower() not in custom_stops]


# One token list per comment, stored next to the raw text.
data_text['filtered_text'] = data_text['text'].apply(_filter_comment)
CORPUS = pd.DataFrame(data_text['filtered_text'])

# Drop rows whose token list is missing (NaN)
CORPUS.dropna(inplace=True)
# WordNetLemmatizer needs a WordNet part-of-speech constant to pick the right
# lemma; any tag not listed below falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Build these once instead of once per token / per row: a set gives constant-time
# membership tests, and the lemmatizer does not depend on the row being processed.
english_stops = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

# lemmatize the text
text_final = []
for entry in CORPUS['filtered_text']:
    # pos_tag labels every token; only the first letter of the tag is used to
    # look up the WordNet part of speech.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        # keep purely alphabetic tokens that are not English stopwords
        if word not in english_stops and word.isalpha()
    ]
    text_final.append(str(Final_words))

# Assign the whole column positionally. Writing row by row with
# CORPUS.loc[position, ...] would add new rows whenever dropna() left gaps in
# the index labels.
CORPUS['text_final'] = text_final

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def build_feature_matrix(documents, feature_type='frequency'):
    """Fit a bag-of-words vectorizer on *documents* and return it with its matrix.

    Args:
        documents: iterable of text documents passed to the vectorizer's
            ``fit_transform``.
        feature_type: one of ``'binary'`` (term presence), ``'frequency'``
            (raw term counts) or ``'tfidf'``. Case and surrounding whitespace
            are ignored.

    Returns:
        A ``(vectorizer, feature_matrix)`` tuple: the fitted vectorizer and the
        result of ``fit_transform`` cast to float.

    Raises:
        ValueError: if *feature_type* is not one of the supported values.
    """
    feature_type = feature_type.lower().strip()
    if feature_type in ('binary', 'frequency'):
        # Both modes use CountVectorizer; they differ only in the binary flag.
        vectorizer = CountVectorizer(binary=(feature_type == 'binary'),
                                     min_df=1, ngram_range=(1, 1))
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1))
    else:
        # ValueError is a subclass of Exception, so existing handlers still catch it.
        raise ValueError("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix

# create a feature matrix
# TF-IDF weights are computed over the stringified lemma lists in 'text_final'.
vectorizer, tfidf_matrix = build_feature_matrix(CORPUS['text_final'], feature_type='tfidf')
# Transposed copy with only its positive entries kept.
# NOTE(review): td_matrix is not referenced again in the visible code — confirm it is still needed.
td_matrix = tfidf_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)

from sklearn.decomposition import NMF
# Factorize into `total_topics` components; the fixed random_state keeps runs repeatable.
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 in favour of
# alpha_W / alpha_H — confirm against the installed version.
nmf = NMF(n_components=total_topics, random_state=42, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf_matrix) 

def get_topics_terms_weights(weights, feature_names):
    """Rank every topic's terms by absolute weight, largest first.

    Args:
        weights: 2-D array with one row of term weights per topic.
        feature_names: sequence of terms, aligned with the columns of *weights*.

    Returns:
        A list with one 2-column array per topic. Each row is ``(term, weight)``,
        ordered from the largest absolute weight to the smallest. Stacking
        terms and weights into a single array is what NumPy does with mixed
        columns, so callers convert the weight back with ``float``.
    """
    vocabulary = np.array(feature_names)
    topics = []
    # argsort is ascending, so each row of indices is reversed to get descending order.
    for topic_weights, ascending in zip(weights, np.argsort(np.abs(weights))):
        order = ascending[::-1]
        # Round-trip through a list so the terms always form a plain string array.
        ranked_terms = np.array(list(vocabulary[order]))
        ranked_weights = np.array(list(topic_weights[order]))
        topics.append(np.vstack((ranked_terms.T, ranked_weights.T)).T)
    return topics

def print_topics_udf(topics, total_topics=1,
                     weight_threshold=0.0001,
                     display_weights=False,
                     num_terms=None):
    """Print the terms of the first *total_topics* topics.

    Args:
        topics: sequence of per-topic ``(term, weight)`` rows; each weight
            must be convertible with ``float``.
        total_topics: how many topics to print, starting from the first.
        weight_threshold: terms whose absolute weight is below this are skipped.
        display_weights: when true, print ``(term, weight)`` pairs with the
            weight rounded to two decimals; otherwise print the terms only.
        num_terms: maximum number of terms per topic; a falsy value prints all.
    """
    for number in range(1, total_topics + 1):
        entries = []
        for term, raw_weight in topics[number - 1]:
            weight = float(raw_weight)
            if abs(weight) >= weight_threshold:
                entries.append((term, round(weight, 2)))

        if num_terms:
            entries = entries[:num_terms]

        if display_weights:
            label, shown = 'with weights', entries
        else:
            label, shown = 'without weights', [term for term, _ in entries]

        print('Topic #' + str(number) + ' ' + label)
        print(shown)
        print()

# Vocabulary terms from the fitted vectorizer.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when the vectorizer provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
weights = nmf.components_

topics = get_topics_terms_weights(weights, feature_names)
# print topics and weights
# print_topics_udf(topics=topics,total_topics=total_topics,num_terms=None,display_weights=False) 
# print topics with weights
# print_topics_udf(topics=topics,total_topics=total_topics,num_terms=None,display_weights=True) 

# display the topics
# this takes the top term from each group and assigns it as the topic theme
for index in range(total_topics):
    print("Topic", index + 1, "=", topics[index][0][0])

The example output may be something like:

Topic 1 = problem
Topic 2 = software
Topic 3 = recommendation

How can I assign a specific topic to a specific comment from the file? E.g., the comment "My computer has an issue of turning off intermittently" would be mapped to Topic 1, "problem".

The answer is to transform the document term matrix to pull out the factorized document topic matrix:

# Fit the model on the TF-IDF matrix and keep the returned document-topic matrix.
W = nmf.fit_transform(tfidf_matrix)

Here the tf-idf matrix is approximated by the product W × H, where W is the document-topic matrix and H is the topic-term matrix. Slide 25 of the link gives a good visualization of this technique: http://derekgreene.com/slides/topic-modelling-with-scikitlearn.pdf

Thus, the highest value in W for the respective comment row corresponds to the assigned topic. I iterated across the rows to assign the topics via

# Label of each topic: its highest-weighted term (first row of each topic table).
topic_labels = [topic[0][0] for topic in topics]
# The largest value in each row of W marks that comment's strongest topic.
# Assigning the whole column at once avoids the chained indexing
# data_text['topic'][row] = ..., which may write to a temporary copy and
# leave the DataFrame unchanged.
data_text['topic'] = [topic_labels[best] for best in np.argmax(W, axis=1)]

To extend the example in the question: if the row at index 1, data_text['text'][1], is "My computer has an issue of turning off intermittently", the corresponding row W[1] may be [0.5412, 0.0201, 0.0]. Since the highest value is in the first column, this sentence is mapped to the first topic (i.e., the 'problem' topic). The topic label is written to data_text['topic'][1] via topics[np.argmax(W[row])][0][0].

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM