import win32com.client
import os
import time
import datetime as dt
import re
import pandas as pd
# Cut-off timestamp: only mail received within the last two days is examined.
DateFilter = dt.datetime.now() - dt.timedelta(days=2)
# Outlook MAPI
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
# Inbox Folder
inbox = outlook.GetDefaultFolder(6)
root_folder = outlook.Folders.Item(1)
print(root_folder.Name)
# Sort emails in inbox (True requests descending order on ReceivedTime)
messages = inbox.Items
messages.Sort("[ReceivedTime]", True)
# Filter emails to go through. %I (12-hour clock) is paired with %p so the
# timestamp reads e.g. "01/15/2024 02:30 PM" instead of "14:30 PM".
DateFilterMsg = messages.Restrict(
    "[ReceivedTime] >= '" + DateFilter.strftime('%m/%d/%Y %I:%M %p') + "'"
)
# Iterate the restricted collection, not the whole inbox, so the date filter
# built above actually takes effect.
for message in DateFilterMsg:
    if message.Class == 43:  # 43 is the class value checked for mail items
        # split() with no argument breaks on any whitespace run, so words
        # separated by line breaks are not glued together and repeated
        # spaces do not produce empty tokens.
        words = message.Body.split()
        # Single pass over the sorted words; rows come out alphabetically.
        counts = {}
        for word in sorted(words):
            counts[word] = counts.get(word, 0) + 1
        # Word name and respective count, one table per message
        df = pd.DataFrame(list(counts.items()), columns=['word_name', 'word_cnt'])
        print(df)
This is the output I get; it seems to be breaking the word counts up by email. I want to see all the word counts in one DataFrame rather than split up per email. For example, the word "System" should have a count of 2 instead of appearing in two separate tables.
Index | word_name | word_cnt |
---|---|---|
0 | System | 1 |
Index | word_name | word_cnt |
---|---|---|
0 | System | 1 |
The problem with your version of the code is this:
Inside the for loop, the code creates a new DataFrame for each message and prints it, which is why you see the word counts of each message in a separate DataFrame.
The changes I have made are as follows: the main DataFrame 'df' is defined outside the for loop. Inside the loop, a new DataFrame 'df2' is created to hold the word counts of each message and is then appended to 'df'.
At the end, outside the for loop, we group by 'word_name' and sum the count values to get the aggregate count of each word across all the messages that were processed.
import win32com.client
import os
import time
import datetime as dt
import re
import pandas as pd
# Show every row when the final table is printed.
pd.set_option('display.max_rows', None)
# Cut-off timestamp: only mail received within the last two days is examined.
DateFilter = dt.datetime.now() - dt.timedelta(days=2)
# Outlook MAPI
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
# Inbox Folder
inbox = outlook.GetDefaultFolder(6)
root_folder = outlook.Folders.Item(1)
print(root_folder.Name)
# Sort emails in inbox (True requests descending order on ReceivedTime)
messages = inbox.Items
messages.Sort("[ReceivedTime]", True)
# Filter emails to go through. %I (12-hour clock) is paired with %p so the
# timestamp reads e.g. "01/15/2024 02:30 PM" instead of "14:30 PM".
DateFilterMsg = messages.Restrict(
    "[ReceivedTime] >= '" + DateFilter.strftime('%m/%d/%Y %I:%M %p') + "'"
)
# Main frame that accumulates the per-message counts. word_cnt is typed up
# front so the counts stay integers once real rows are concatenated.
df = pd.DataFrame(columns=['word_name', 'word_cnt']).astype({'word_cnt': 'int64'})
# Iterate the restricted collection, not the whole inbox, so the date filter
# built above actually takes effect.
for message in DateFilterMsg:
    if message.Class == 43:  # 43 is the class value checked for mail items
        # split() with no argument breaks on any whitespace run, so words
        # separated by line breaks are not glued together and repeated
        # spaces do not produce empty tokens.
        words = message.Body.split()
        # Single pass over the sorted words instead of list.count per word.
        counts = {}
        for word in sorted(words):
            counts[word] = counts.get(word, 0) + 1
        # Word counts of this message only
        df2 = pd.DataFrame(list(counts.items()), columns=['word_name', 'word_cnt'])
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        df = pd.concat([df, df2], ignore_index=True)
# Collapse the per-message rows into one total per word.
df = df.groupby('word_name', as_index=False).sum()
print(df)
New Answer
import win32com.client
import os
import time
import datetime as dt
import re
import pandas as pd
# Show every row (and up to 30 columns) when the final table is printed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 30)
# Cut-off timestamp: only mail received within the last two days is examined.
DateFilter = dt.datetime.now() - dt.timedelta(days=2)
# Outlook MAPI
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
# Inbox Folder
inbox = outlook.GetDefaultFolder(6)
root_folder = outlook.Folders.Item(1)
print(root_folder.Name)
# Sort emails in inbox (True requests descending order on ReceivedTime)
messages = inbox.Items
messages.Sort("[ReceivedTime]", True)
# Filter emails to go through. %I (12-hour clock) is paired with %p so the
# timestamp reads e.g. "01/15/2024 02:30 PM" instead of "14:30 PM".
DateFilterMsg = messages.Restrict(
    "[ReceivedTime] >= '" + DateFilter.strftime('%m/%d/%Y %I:%M %p') + "'"
)
# Main frame that accumulates the per-message counts. word_cnt is typed up
# front so the counts stay integers once real rows are concatenated.
df = pd.DataFrame(columns=['word_name', 'word_cnt']).astype({'word_cnt': 'int64'})
# Iterate the restricted collection, not the whole inbox, so the date filter
# built above actually takes effect.
for message in DateFilterMsg:
    if message.Class == 43:  # 43 is the class value checked for mail items
        # split() with no argument breaks on any whitespace run, so words
        # separated by line breaks are not glued together.
        words = message.Body.split()
        # Single pass over the sorted words; only purely alphabetic tokens
        # are counted, which drops numbers and tokens carrying punctuation.
        counts = {}
        for word in sorted(words):
            if word.isalpha():
                counts[word] = counts.get(word, 0) + 1
        # Word counts of this message only
        df2 = pd.DataFrame(list(counts.items()), columns=['word_name', 'word_cnt'])
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        df = pd.concat([df, df2], ignore_index=True)
# Collapse the per-message rows into one total per word.
df = df.groupby('word_name', as_index=False).sum()
print(df)
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.