I have to apply multiple functions to a column to get the list of bigrams, but it is painfully slow with the apply function the way I'm currently using it. Do you have a way to boost the speed?
def remove_stop_words(text):
    """Lowercase *text*, strip non-word characters, and drop stop words.

    Relies on module-level names: ``re``, ``word_tokenize`` (NLTK) and
    ``stop_words`` (a collection of words to remove).

    Returns the surviving tokens as a single space-joined string.
    """
    cleantext = text.lower()
    # Replace every non-word character with a space, then collapse runs of
    # whitespace into single spaces.
    cleantext = ' '.join(re.sub(r'[^\w]', ' ', cleantext).strip().split())
    # Fix: build the result with str.join instead of repeated string
    # concatenation, which is quadratic on long headlines.
    return ' '.join(w for w in word_tokenize(cleantext) if w not in stop_words)
def lemmatize(text):
    """Lemmatize each token of *text* (lowercased) as noun, then verb,
    then adjective, and return the lemmas space-joined.

    Relies on module-level ``word_tokenize`` and ``wordnet_lemmatizer``.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        # Chain the three POS passes: each pass refines the previous result.
        for pos in ("n", "v", "a"):
            token = wordnet_lemmatizer.lemmatize(token, pos=pos)
        lemmas.append(token)
    return ' '.join(lemmas)
def get_ngrams(text, n):
    """Return the n-grams of *text* as a list of space-joined strings.

    Relies on module-level ``word_tokenize`` and ``ngrams`` (NLTK).

    Bug fix: the original hard-coded ``n=2`` in the ``ngrams`` call and
    silently ignored the caller-supplied ``n``; it is now honored.
    """
    return [' '.join(gram) for gram in ngrams(word_tokenize(text), n=n)]
# Build a 'bigrams' column per headline: clean -> lemmatize -> extract bigrams.
df['bigrams'] = df.headline.apply(lambda x: get_ngrams(lemmatize(remove_stop_words(x)),n=2))
Edit: (based on comment) The data frame df contains 2 columns - 1. headline 2. Sentiment score
headline - It's news headline, basically text on which I've to apply the function to get the bigrams of the headline
Sentiment Score - I have to keep the score as well in the df dataframe, hence the need for a column called "bigrams" in the same data frame
I found the best way to do this was to parallelize the process using the multiprocessing library.
import numpy as np
import pandas as pd
import re
import time
from nltk import pos_tag, word_tokenize
from nltk.util import ngrams
import nltk
from nltk.corpus import stopwords
import nltk.data
from nltk.stem import WordNetLemmatizer
import random
from multiprocessing import Pool
def get_ngrams(text, n=2):
    """Split *text* on whitespace and return its n-grams, each joined by spaces.

    Relies on module-level ``ngrams`` (nltk.util).
    """
    tokens = text.split()
    return list(map(' '.join, ngrams(tokens, n=n)))
def bigrams(df):
    """Add a 'bigrams' column derived from the 'headline' column and
    return the same DataFrame (mutated in place).
    """
    def pipeline(headline):
        # clean -> lemmatize -> bigram extraction
        return get_ngrams(lemmatize(remove_stop_words(headline)), n=2)

    df['bigrams'] = df.headline.apply(pipeline)
    return df
def parallelize_dataframe(df, func, n_cores=20):
    """Split *df* into ``n_cores`` chunks, apply *func* to each chunk in a
    separate worker process, and return the concatenated result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Module-level (picklable) function that takes and returns a DataFrame.
    n_cores : int, default 20
        Number of chunks / worker processes.

    Fix: the pool is now managed with a context manager so worker processes
    are reliably cleaned up even if ``func`` raises; the original leaked the
    pool on any exception between ``Pool(...)`` and ``pool.close()``.
    """
    chunks = np.array_split(df, n_cores)
    with Pool(n_cores) as pool:
        # map blocks until every chunk has been processed (or re-raises).
        result = pd.concat(pool.map(func, chunks))
    return result
# Run the bigram pipeline across worker processes, then explode so each
# bigram gets its own row (the other columns are duplicated per bigram).
df2 = parallelize_dataframe(df, bigrams)
bigramScore = df2.explode('bigrams')
Note: This is useful only when you have a large number of cores available. If you have just 2-3 cores, this might not be the best approach, as the overhead cost of parallelizing the process also has to be considered.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.