I have the code below and I would like to find out how it can be rewritten to be more efficient (i.e. faster and more readable), given also that my actual vocabulary consists of 100k words and not 17 as below.
The goal is to count, from a list of strings, how many times (as a percentage) a word appears per group of these strings.
So, for example, in the descriptions below the word "elections"
appears in 50% of the Politics
group and in 0% of the Economics
group.
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import time

start_time = time.time()

descriptions = ['New elections are scheduled for 2023', 'The unemployment is rising',
                ' The new Prime Minister worked as a software engineer before']
labels = ['Politics', 'Economics', 'Politics']

vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1),
                             min_df=1)  # in a proper dataset min_df=5 or more
vectorizer.fit(descriptions)
# A set gives O(1) membership tests.  The original kept the vocabulary as a
# list, making each `word in vocabulary` test O(|vocab|) -- the main reason
# the loop would crawl with a 100k-word vocabulary.
vocabulary = set(vectorizer.vocabulary_)

labels_counts = Counter(labels)                 # label -> number of documents
unique_labels = list(labels_counts)             # insertion order preserved
unique_labels_indexes = {label: i for i, label in enumerate(unique_labels)}
labels_number = len(unique_labels)
print(dict(labels_counts))

# words_dict_count: raw occurrence counts per label group
# words_dict:       counts as a fraction of the group's document count
words_dict_count = {word: [0] * labels_number for word in vocabulary}
words_dict = {word: [0] * labels_number for word in vocabulary}

for label, description in zip(labels, descriptions):
    label_index = unique_labels_indexes[label]
    # BUG FIX: the original divided by labels_counts[labels[label_index]],
    # indexing `labels` with an index into `unique_labels`; it only produced
    # the right answer here by coincidence.  Divide by the size of the
    # current document's group instead.
    group_size = labels_counts[label]
    for word in description.lower().split():
        if word in vocabulary:
            words_dict_count[word][label_index] += 1
            words_dict[word][label_index] = words_dict_count[word][label_index] / group_size

# words_dict should look like: {'new': [1.0, 0], 'elections': [0.5, 0], 'are': [0.5, 0], 'scheduled': [0.5, 0], 'for': [0.5, 0], '2023': [0.5, 0], 'the': [0.5, 1.0], 'unemployment': [0, 1.0], 'is': [0, 1.0], 'rising': [0, 1.0], 'prime': [0.5, 0], 'minister': [0.5, 0], 'worked': [0.5, 0], 'as': [0.5, 0], 'software': [0.5, 0], 'engineer': [0.5, 0], 'before': [0.5, 0]}
end_time = time.time()
print('Time:', round(end_time - start_time, 3), 'seconds')
Could you test this alternative version and see whether it is faster?
import re
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
import time
def printPretty(name: str, value):
    """Print a separator rule, then *name* and *value* on one line."""
    print("================")
    print(name, value)
def getWordIndex(vocabularyDictionary: dict, word: str) -> int:
    """Return the column index assigned to *word* by the fitted vocabulary."""
    columnIndex = vocabularyDictionary[word]
    return columnIndex
def getFrequency(frequencyArray: list, Wordindex: int) -> int:
    """Return the count stored at position *Wordindex* of one document's row."""
    count = frequencyArray[Wordindex]
    return count
def getLabelIndices(labels: list) -> dict:
    """Group document positions by label: label -> list of row indices.

    Returns a defaultdict(list) so an unknown label yields an empty list
    rather than raising KeyError.
    """
    grouped = defaultdict(list)
    for position, labelName in enumerate(labels):
        grouped[labelName].append(position)
    return grouped
def getTotalFrequency(word: str, group: str, labels: list, allFrequencies: list, vocabularyDictionary: dict) -> float:
    """Return the mean count of *word* across the documents labelled *group*.

    Args:
        word: token to look up (must exist in vocabularyDictionary).
        group: label name selecting which documents to average over.
        labels: per-document label, parallel to allFrequencies.
        allFrequencies: document-term count matrix (one row per document).
        vocabularyDictionary: token -> column index mapping from the vectorizer.

    Raises:
        KeyError: if *word* is not in the vocabulary.
        ZeroDivisionError: if no document carries *group* (matches original).
    """
    # Fix: annotated -> int but the division below yields a float.
    # Hoisted out of the loop: the word's column index is loop-invariant,
    # yet the original recomputed it for every document in the group.
    wordIndex = vocabularyDictionary[word]
    # Only this group's row indices are needed; the original rebuilt the
    # full label->indices map on EVERY call (|groups| * |vocab| times).
    groupIndices = [i for i, labelName in enumerate(labels) if labelName == group]
    total = sum(allFrequencies[i][wordIndex] for i in groupIndices)
    return total / len(groupIndices)
start_time = time.time()

descriptions = ['New elections are scheduled for 2023', 'The unemployment is rising',
                ' The new Prime Minister worked as a software engineer before']
labels = ['Politics', 'Economics', 'Politics']

# in a proper dataset I would set min_df=5 or more
vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1), min_df=1)
vectorizer.fit(descriptions)
vocabularyDict = vectorizer.vocabulary_

# Encode the documents as a dense document-term count matrix.
vectorArray = vectorizer.transform(descriptions).toarray()

# For every (group, word) pair record [group, mean count of word in group].
results = defaultdict(list)
for groupName in set(labels):
    for token in vocabularyDict:
        meanFrequency = getTotalFrequency(token, groupName, labels,
                                          vectorArray, vocabularyDict)
        results[token].append([groupName, meanFrequency])
print(results)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. Any questions, please contact: yoyou2525@163.com.