简体   繁体   中英

Read All Emails from Outlook and add the word counts to a DataFrame

import win32com.client
import os
import time
import datetime as dt
import re
import pandas as pd

from collections import Counter

# Cut-off used to build the date filter below: two days before now.
DateFilter = dt.datetime.now() - dt.timedelta(days=2)

#Outlook MAPI
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
#Inbox Folder
inbox = outlook.GetDefaultFolder(6)  # 6 == olFolderInbox
root_folder = outlook.Folders.Item(1)
print(root_folder.Name)
#Sort emails in inbox
messages = inbox.Items
messages.Sort("[ReceivedTime]", True)  # True -> descending order
#Filter emails to go through
# %I (12-hour clock) is the directive that pairs with %p; the previous %H
# produced strings such as "15:30 PM".
DateFilterMsg = messages.Restrict("[ReceivedTime] >= '" + DateFilter.strftime('%m/%d/%Y %I:%M %p') + "'")

# NOTE: this loop walks every item in the inbox. Iterate DateFilterMsg instead
# of messages to limit the count to mail received after DateFilter.
for message in messages:
    if message.Class == 43:  # 43 == olMail; other item classes are skipped
        # split() without an argument splits on any run of whitespace, so
        # line breaks do not stay attached to words and no empty tokens appear.
        words = message.Body.split()
        # One pass over the words instead of list.count() per unique word.
        word_counts = Counter(words)
        # A new frame is built and printed for each message.
        df = pd.DataFrame(sorted(word_counts.items()), columns=['word_name', 'word_cnt']) #Word Name and respective Counts
        print(df)

This is the output I get; it seems to be breaking the word counts up by email. I want to see all the word counts in one DataFrame rather than one table per email. For example, the word "System" should give me a count of 2, not appear in two separate tables.

Index word_name word_cnt
0 System 1

Index word_name word_cnt
0 System 1

The problem with the version of your code was that:

In the for loop, the code was generating a new dataframe for each message and printing it, hence you could see the word counts of each message in a separate dataframe.

The change I have made is to define the main dataframe 'df' outside the for loop, create a new dataframe 'df2' that stores the word counts of each message, and then append it to 'df' inside the loop itself.

At the end, outside the for loop, we group by 'word_name' and sum the count values to get the aggregate count of each word across all the messages in the mailbox.


    import win32com.client
    import os
    import time
    import datetime as dt
    import re
    import pandas as pd
    
    from collections import Counter

    pd.set_option('display.max_rows', None)  # print every row of the result
    # Cut-off used to build the date filter below: two days before now.
    DateFilter = dt.datetime.now() - dt.timedelta(days=2)
    
    #Outlook MAPI
    outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
    #Inbox Folder
    inbox = outlook.GetDefaultFolder(6)  # 6 == olFolderInbox
    root_folder = outlook.Folders.Item(1)
    print(root_folder.Name)
    #Sort emails in inbox
    messages = inbox.Items
    messages.Sort("[ReceivedTime]", True)  # True -> descending order
    #Filter emails to go through
    # %I (12-hour clock) is the directive that pairs with %p; the previous %H
    # produced strings such as "15:30 PM".
    DateFilterMsg = messages.Restrict("[ReceivedTime] >= '" + DateFilter.strftime('%m/%d/%Y %I:%M %p') + "'")
    # Main frame that accumulates the per-message counts.
    df = pd.DataFrame(columns=['word_name', 'word_cnt'])
    # NOTE: this loop walks every item in the inbox. Iterate DateFilterMsg
    # instead of messages to limit the count to mail received after DateFilter.
    for message in messages:
        if message.Class != 43:  # 43 == olMail; other item classes are skipped
            continue
        # split() without an argument splits on any run of whitespace, so
        # line breaks do not stay attached to words and no empty tokens appear.
        words = message.Body.split()
        if not words:
            continue  # empty body: nothing to add
        # One pass over the words instead of list.count() per unique word.
        df2 = pd.DataFrame(list(Counter(words).items()), columns=['word_name', 'word_cnt'])
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        # The first non-empty frame is taken as-is so that concat never
        # receives an empty frame.
        df = df2 if df.empty else pd.concat([df, df2], ignore_index=True) #Word Name and respective Counts
    # Collapse the per-message rows into one total per word.
    df = df.groupby('word_name', as_index=False).sum()
    print(df)

New Answer


    import win32com.client
    import os
    import time
    import datetime as dt
    import re
    import pandas as pd
    
    from collections import Counter

    pd.set_option('display.max_rows', None)  # print every row of the result
    pd.set_option('display.max_columns', 30)
    # Cut-off used to build the date filter below: two days before now.
    DateFilter = dt.datetime.now() - dt.timedelta(days=2)
    
    #Outlook MAPI
    outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
    #Inbox Folder
    inbox = outlook.GetDefaultFolder(6)  # 6 == olFolderInbox
    root_folder = outlook.Folders.Item(1)
    print(root_folder.Name)
    #Sort emails in inbox
    messages = inbox.Items
    messages.Sort("[ReceivedTime]", True)  # True -> descending order
    #Filter emails to go through
    # %I (12-hour clock) is the directive that pairs with %p; the previous %H
    # produced strings such as "15:30 PM".
    DateFilterMsg = messages.Restrict("[ReceivedTime] >= '" + DateFilter.strftime('%m/%d/%Y %I:%M %p') + "'")
    # Main frame that accumulates the per-message counts.
    df = pd.DataFrame(columns=['word_name', 'word_cnt'])
    # NOTE: this loop walks every item in the inbox. Iterate DateFilterMsg
    # instead of messages to limit the count to mail received after DateFilter.
    for message in messages:
        if message.Class != 43:  # 43 == olMail; other item classes are skipped
            continue
        # split() without an argument splits on any run of whitespace, so
        # line breaks do not stay attached to words and no empty tokens appear.
        words = message.Body.split()
        # Count in one pass, then keep only purely alphabetic tokens
        # (drops numbers, punctuation-laden tokens, URLs, ...).
        uniquewords = [(word, cnt) for word, cnt in Counter(words).items() if word.isalpha()]
        if not uniquewords:
            continue  # no alphabetic words in this message
        df2 = pd.DataFrame(uniquewords, columns=['word_name', 'word_cnt'])
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        # The first non-empty frame is taken as-is so that concat never
        # receives an empty frame.
        df = df2 if df.empty else pd.concat([df, df2], ignore_index=True) #Word Name and respective Counts
    # Collapse the per-message rows into one total per word.
    df = df.groupby('word_name', as_index=False).sum()
    print(df)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM