簡體   English   中英

如何在 python 中自動化這個重復的過程

[英]How to automate this repetitive process in python

在我收集的 Twitter 數據的 3 天里,我有一個每天在推文中查找最常見單詞的過程。 正如您在下面看到的,我只是重復每天的代碼。 這很好並且可以完成這項工作,盡管我對編程還很陌生,並且感覺有一種更簡潔的方法可以做到這一點,而不是重復代碼 3 次。

關於如何自動化以避免重復代碼的任何幫助?

#strip grouped tweets by day to lists
# Pull each day's tweet texts out of its grouped frame as plain Python lists.
day1tweets, day2tweets, day3tweets = (
    frame['text'].to_list() for frame in (day1, day2, day3)
)

############# Day 1
# Tokenize each tweet, flatten to one word list, lowercase, and drop
# stopwords plus one-letter tokens.
day1tweet = [word_tokenize(i) for i in day1tweets]
day1tweet = list(itertools.chain(*day1tweet))
day1tweet = [word.lower() for word in day1tweet]
day1tweet = [word for word in day1tweet
             if word not in default_stopwords
             and word not in custom_stopwords
             and len(word) > 1]

# Word frequencies -> DataFrame of the 20 most common words.
# BUG FIX: the original assigned a *set* to .columns ({'word', 'freq'}),
# whose iteration order is not guaranteed, then swapped the labels in
# rename() ('word' -> 'frequency'); building the DataFrame directly from
# most_common() is deterministic and skips the lossy format/str.split
# round-trip (which also broke on words containing spaces).
d1dist = nltk.FreqDist(day1tweet)
top20d1 = pd.DataFrame(d1dist.most_common(20), columns=['words', 'frequency'])
top20d1['frequency'] = top20d1['frequency'].astype(int)

# Horizontal bar chart of the top-20 words (most frequent at the top).
plt.barh(top20d1['words'], top20d1['frequency'], color='#005EB8')
plt.title('Most common words in user profile descriptions')
plt.ylabel('Top 20 words')
plt.xlabel('Frequency')
plt.gca().invert_yaxis()
plt.gca().set_xlim([1000, 23000])
plt.gcf().set_size_inches(10, 8)

############# Day 2
# Tokenize each tweet, flatten to one word list, lowercase, and drop
# stopwords plus one-letter tokens.
day2tweet = [word_tokenize(i) for i in day2tweets]
day2tweet = list(itertools.chain(*day2tweet))
day2tweet = [word.lower() for word in day2tweet]
day2tweet = [word for word in day2tweet
             if word not in default_stopwords
             and word not in custom_stopwords
             and len(word) > 1]

# Word frequencies -> DataFrame of the 20 most common words.
# BUG FIX: same as Day 1 — .columns was assigned a set (nondeterministic
# order) and rename() swapped the labels; construct the frame directly
# from most_common() instead.
d2dist = nltk.FreqDist(day2tweet)
top20d2 = pd.DataFrame(d2dist.most_common(20), columns=['words', 'frequency'])
top20d2['frequency'] = top20d2['frequency'].astype(int)

# Horizontal bar chart of the top-20 words (most frequent at the top).
plt.barh(top20d2['words'], top20d2['frequency'], color='#005EB8')
plt.title('Most common words in user profile descriptions')
plt.ylabel('Top 20 words')
plt.xlabel('Frequency')
plt.gca().invert_yaxis()
plt.gca().set_xlim([1000, 23000])
plt.gcf().set_size_inches(10, 8)


############# Day 3
# Tokenize each tweet, flatten to one word list, lowercase, and drop
# stopwords plus one-letter tokens.
day3tweet = [word_tokenize(i) for i in day3tweets]
day3tweet = list(itertools.chain(*day3tweet))
day3tweet = [word.lower() for word in day3tweet]
day3tweet = [word for word in day3tweet
             if word not in default_stopwords
             and word not in custom_stopwords
             and len(word) > 1]

# Word frequencies -> DataFrame of the 20 most common words.
# BUG FIX: same as Days 1-2 — .columns was assigned a set (nondeterministic
# order) and rename() swapped the labels; construct the frame directly
# from most_common() instead.
d3dist = nltk.FreqDist(day3tweet)
top20d3 = pd.DataFrame(d3dist.most_common(20), columns=['words', 'frequency'])
top20d3['frequency'] = top20d3['frequency'].astype(int)

# Horizontal bar chart of the top-20 words (most frequent at the top).
plt.barh(top20d3['words'], top20d3['frequency'], color='#005EB8')
plt.title('Most common words in user profile descriptions')
plt.ylabel('Top 20 words')
plt.xlabel('Frequency')
plt.gca().invert_yaxis()
#plt.gca().set_xlim([1000,34000])
plt.gcf().set_size_inches(10, 8)

您可以像下面這樣把重復的步驟封裝成一個函數,這樣就不必多次編寫相同的代碼。

def do_something(tweet):
  """Tokenize one day's tweet texts and plot the 20 most common words.

  Parameters
  ----------
  tweet : list of str
      Raw tweet texts for a single day.

  Side effects: draws a horizontal bar chart on the current matplotlib
  figure. Relies on module-level ``default_stopwords`` / ``custom_stopwords``.
  """
  # Tokenize every tweet, flatten to a single lowercase word list.
  words = [w.lower() for text in tweet for w in word_tokenize(text)]
  # Drop stopwords and one-letter tokens.
  words = [w for w in words
           if w not in default_stopwords
           and w not in custom_stopwords
           and len(w) > 1]

  # Word frequencies -> DataFrame of the 20 most common words.
  # BUG FIX: the original assigned a *set* to .columns ({'word', 'freq'}),
  # whose iteration order is not guaranteed, then swapped the labels in
  # rename(); building the DataFrame directly from most_common() is
  # deterministic and avoids the format/str.split round-trip.
  dist = nltk.FreqDist(words)
  top20 = pd.DataFrame(dist.most_common(20), columns=['words', 'frequency'])
  top20['frequency'] = top20['frequency'].astype(int)

  # Horizontal bar chart, most frequent word at the top.
  plt.barh(top20['words'], top20['frequency'], color='#005EB8')
  plt.title('Most common words in user profile descriptions')
  plt.ylabel('Top 20 words')
  plt.xlabel('Frequency')
  plt.gca().invert_yaxis()
  plt.gca().set_xlim([1000, 23000])
  plt.gcf().set_size_inches(10, 8)

# Run the per-day pipeline on each day's tweet texts.
# BUG FIX: the function is named do_something — the original called the
# misspelled do_somthing, which would raise NameError at runtime.
for day in (day1, day2, day3):
    do_something(day['text'].to_list())

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM