Read All Emails from Outlook and add the word counts to a DataFrame

Question

import win32com.client
import os
import time
import datetime as dt
import re
from collections import Counter
import pandas as pd

# Script purpose: for every mail item received in the Inbox during the last
# two days, print a DataFrame holding that message's per-word counts.

# Cut-off: only messages received within the last two days are processed.
DateFilter = dt.datetime.now() - dt.timedelta(days=2)

# Outlook MAPI
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
# Inbox Folder
inbox = outlook.GetDefaultFolder(6)
root_folder = outlook.Folders.Item(1)
print(root_folder.Name)
# Sort emails in inbox (descending by received time)
messages = inbox.Items
messages.Sort("[ReceivedTime]", True)
# Filter emails to go through.
# %I (12-hour clock) is used because the string carries an AM/PM marker;
# combining %H with %p produced values such as "14:30 PM".
# NOTE(review): Outlook is documented to parse this string using the
# machine's regional date settings - confirm m/d/Y matches your locale.
DateFilterMsg = messages.Restrict(
    "[ReceivedTime] >= '" + DateFilter.strftime('%m/%d/%Y %I:%M %p') + "'"
)

# Iterate the restricted collection; looping over `messages` would ignore
# the date filter built above.
for message in DateFilterMsg:
    if message.Class != 43:  # 43 is the object class Outlook reports for mail items
        continue
    # split() with no argument breaks on any whitespace run, so words
    # separated by newlines or tabs are not glued together and no empty
    # tokens are produced.
    words = message.Body.split()
    # One pass over the words instead of list.count() per distinct word;
    # sorted so the printed table is in a stable, alphabetical order.
    uniquewords = sorted(Counter(words).items())
    df = pd.DataFrame(uniquewords, columns=['word_name', 'word_cnt'])  # Word name and respective counts
    print(df)

This is the output I get. It seems to be breaking the word counts up by email. I want to see all the word counts in one DataFrame rather than one table per email. For example, the word "System" should give me a count of 2, not two separate tables.

Index	word_name	word_cnt
0	System	1

Index	word_name	word_cnt
0	System	1

Answer 1

The problem with your version of the code was this:

In the for loop, the code was generating a new dataframe for each message and printing it, hence you could see the word counts of each message in a separate dataframe.

The changes I have made are as follows: the main DataFrame `df` is defined outside the for loop; inside the loop, a new DataFrame `df2` is created to hold the word counts of each message, and it is then appended to `df`.

At the end, outside the for loop, we need to group by 'word_name' and then sum the count values to get the aggregate count of each word across all the messages in the mailbox.


    import win32com.client
    import os
    import time
    import datetime as dt
    import re
    from collections import Counter
    import pandas as pd
    
    # Script purpose: count how often each word occurs across all mail items
    # received in the Inbox during the last two days, in a single DataFrame.
    
    pd.set_option('display.max_rows', None)
    # Cut-off: only messages received within the last two days are processed.
    DateFilter = dt.datetime.now() - dt.timedelta(days=2)
    
    #Outlook MAPI
    outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
    #Inbox Folder
    inbox = outlook.GetDefaultFolder(6)
    root_folder = outlook.Folders.Item(1)
    print(root_folder.Name)
    #Sort emails in inbox (descending by received time)
    messages = inbox.Items
    messages.Sort("[ReceivedTime]", True)
    #Filter emails to go through.
    # %I (12-hour clock) is used because the string carries an AM/PM marker;
    # combining %H with %p produced values such as "14:30 PM".
    # NOTE(review): Outlook is documented to parse this string using the
    # machine's regional date settings - confirm m/d/Y matches your locale.
    DateFilterMsg = messages.Restrict(
        "[ReceivedTime] >= '" + DateFilter.strftime('%m/%d/%Y %I:%M %p') + "'"
    )
    # Main DataFrame, defined outside the loop. The columns are typed up front
    # so that combining it with the per-message frames keeps integer counts.
    df = pd.DataFrame({
        'word_name': pd.Series(dtype='object'),
        'word_cnt': pd.Series(dtype='int64'),
    })
    # Iterate the restricted collection; looping over `messages` would ignore
    # the date filter built above.
    for message in DateFilterMsg:
        if message.Class != 43:  # 43 is the object class Outlook reports for mail items
            continue
        # split() with no argument breaks on any whitespace run, so words
        # separated by newlines or tabs are not glued together.
        words = message.Body.split()
        if not words:
            continue  # nothing to count in an empty body
        # One pass over the words instead of list.count() per distinct word.
        uniquewords = list(Counter(words).items())
        df2 = pd.DataFrame(uniquewords, columns=['word_name', 'word_cnt'])
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        df = pd.concat([df, df2], ignore_index=True)  #Word Name and respective Counts
    # Collapse the per-message rows into one total per word.
    df = df.groupby('word_name', as_index=False).sum()
    print(df)

Answer 2

New Answer


    import win32com.client
    import os
    import time
    import datetime as dt
    import re
    from collections import Counter
    import pandas as pd
    
    # Script purpose: count how often each purely alphabetic word occurs across
    # all mail items received in the Inbox during the last two days.
    
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', 30)
    # Cut-off: only messages received within the last two days are processed.
    DateFilter = dt.datetime.now() - dt.timedelta(days=2)
    
    #Outlook MAPI
    outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
    #Inbox Folder
    inbox = outlook.GetDefaultFolder(6)
    root_folder = outlook.Folders.Item(1)
    print(root_folder.Name)
    #Sort emails in inbox (descending by received time)
    messages = inbox.Items
    messages.Sort("[ReceivedTime]", True)
    #Filter emails to go through.
    # %I (12-hour clock) is used because the string carries an AM/PM marker;
    # combining %H with %p produced values such as "14:30 PM".
    # NOTE(review): Outlook is documented to parse this string using the
    # machine's regional date settings - confirm m/d/Y matches your locale.
    DateFilterMsg = messages.Restrict(
        "[ReceivedTime] >= '" + DateFilter.strftime('%m/%d/%Y %I:%M %p') + "'"
    )
    # Main DataFrame, defined outside the loop. The columns are typed up front
    # so that combining it with the per-message frames keeps integer counts.
    df = pd.DataFrame({
        'word_name': pd.Series(dtype='object'),
        'word_cnt': pd.Series(dtype='int64'),
    })
    # Iterate the restricted collection; looping over `messages` would ignore
    # the date filter built above.
    for message in DateFilterMsg:
        if message.Class != 43:  # 43 is the object class Outlook reports for mail items
            continue
        # split() with no argument breaks on any whitespace run, so words
        # separated by newlines or tabs are not glued together (and are not
        # then rejected by the isalpha() check below).
        words = message.Body.split()
        # Count in one pass, keeping only tokens made up entirely of letters.
        uniquewords = [
            (word, count)
            for word, count in Counter(words).items()
            if word.isalpha()
        ]
        if not uniquewords:
            continue  # no alphabetic words in this message
        df2 = pd.DataFrame(uniquewords, columns=['word_name', 'word_cnt'])
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        df = pd.concat([df, df2], ignore_index=True)  #Word Name and respective Counts
    # Collapse the per-message rows into one total per word.
    df = df.groupby('word_name', as_index=False).sum()
    print(df)

Read All Emails from Outlook and add the word counts to a DataFrame

Question

2 answers

solution1
0 2022-09-15 09:49:41

solution2
0 2022-09-16 11:07:22

Read All Emails from Outlook and add the word counts to a DataFrame

Question

2 answers

solution1 0 2022-09-15 09:49:41

solution2 0 2022-09-16 11:07:22

solution1
0 2022-09-15 09:49:41

solution2
0 2022-09-16 11:07:22