Calculate Duration and Average of CSV file data using python

Question

I have a CSV file, and I want to write a script in which the user enters a source IP and a destination IP. The script should find every row in the CSV file that matches that source/destination pair, calculate the time difference between consecutive matching sessions, and finally compute the average of those durations. Below is an example of the data in column A of my CSV. The file also has several other columns, such as Time, Source IP and Destination IP, but instead of using three different columns we can just use column A, which already contains all three pieces of information.

_raw

2013-07-18 04:54:15.871 UDP 172.12.332.11:20547 172.12.332.11:20547 -> 172.56.213.80:53 CREATE Ignore 0

2013-07-18 04:54:15.841 UDP 192.33.230.81:37192 192.81.130.82:37192 -> 172.81.123.70:53 CREATE Ignore 0

2013-07-18 04:54:15.831 TCP 172.12.332.11:42547 172.12.332.11:42547
-> 172.56.213.80:53 CREATE Ignore 0

Below is my Python code, which no longer works. All that happens now is that it skips the IPs and does nothing. Please help me fix it, as I am at a loss as to why it does not work.

My code in python:

import sys
from sys import argv
from datetime import datetime, timedelta

# The script expects exactly three user arguments: source IP, destination IP
# and the data file. Unpacking argv directly would fail with an opaque
# ValueError when one is missing, so check the count and show a usage line.
if len(argv) != 4:
    print("usage: python <script> <source_ip> <destination_ip> <csv_file>")
    sys.exit(1)
script, source, destination, filename = argv

line_num = 0  # number of the line currently being read
count = 0  # how many lines matched the source/destination pair
occurrences = []  # timestamps (datetime objects) of every matching line
line_array = []  # line number of every matching line, parallel to occurrences
avg = 0  # average gap between adjacent matches, in seconds
total = 0  # running sum of the gaps, in microseconds

#function for converting timedelta to microseconds
def timedelta_to_microtime(td):
    """Return the length of the timedelta *td* as whole microseconds.

    A negative timedelta gives a negative result: timedelta stores it as a
    negative ``days`` with non-negative ``seconds`` and ``microseconds``,
    and the sum below folds the three fields back together.
    """
    return td.microseconds + (td.seconds + td.days * 86400) * 1000000
# Collect the timestamp and line number of every line that matches the
# requested source/destination pair. The try/except turns an unreadable
# data file into a short error message instead of a traceback.
try:
    # 'with' closes the file even if parsing one of its lines raises.
    with open(filename) as data_file:
        # Number the lines from 1 so reported numbers match the file itself.
        for line_num, line in enumerate(data_file, 1):
            # Data lines start with the date; anything else (for example
            # the header row) is skipped.
            if not line[0].isdigit():
                continue
            # A record reads "<source> -> <destination>", so the source must
            # appear before the arrow and the destination after it. Note
            # that a bare `source and destination in line` would only test
            # the destination, because `source` alone is always truthy.
            before_arrow, arrow, after_arrow = line.partition('->')
            if not arrow:
                continue
            if source not in before_arrow or destination not in after_arrow:
                continue
            # count every occurrence of the matching IP combination
            count += 1
            # The first 23 characters of the line hold the date/time; parse
            # them into a datetime object and remember it.
            occurrences.append(datetime.strptime(line[:23], '%Y-%m-%d %H:%M:%S.%f'))
            line_array.append(line_num)
# if the script can't read the data file, notify the user and terminate
except IOError:
    print("\n[ERROR]: Cannot read data file, check file name and try again.")
    sys.exit()

# Report how many lines matched, the gap between each adjacent pair of
# matches, and the average gap.
print("\nFound %s matches for [source: %s] and [destination: %s]:\n" % (len(occurrences), source, destination))

# Without any match there are no times to show, so skip the heading.
if occurrences:
    print("Time between adjacent connections:\n")

# Walk the matches in adjacent pairs: (first, second), (second, third), ...
for i, (previous, current) in enumerate(zip(occurrences, occurrences[1:])):
    # Earlier-listed minus later-listed timestamp, as a timedelta.
    # NOTE(review): this is positive only when the file lists the newest
    # records first, as the sample data does -- confirm for other files.
    difference = previous - current
    # line numbers of the two records, for display
    time1 = line_array[i]
    time2 = line_array[i + 1]
    # convert the timedelta to microseconds for computing the average
    time_m = timedelta_to_microtime(difference)
    total += time_m
    print("Line %s and Line %s: %s" % (time1, time2, difference))

# An average needs at least one gap, i.e. at least two matches; with a
# single match the divisor below would be zero.
if len(occurrences) > 1:
    # total microseconds / number of gaps, then / 1,000,000 to get seconds
    avg = (total / float(len(occurrences) - 1)) / 1000000
    print("\nAverage: %s seconds" % (avg))

Answer 1

You can solve this problem much more easily if you use a high-level library like pandas. Let me demonstrate:

Let's say you have the following data saved in file.csv:

2013-07-18 04:54:15.871 UDP 172.12.332.11:20547 172.12.332.11:20547 -> 172.56.213.80:53 CREATE Ignore 0
2013-07-18 04:54:15.841 UDP 192.33.230.81:37192 192.81.130.82:37192 -> 172.81.123.70:53 CREATE Ignore 0
2013-07-18 04:54:15.831 TCP 172.12.332.11:42547 172.12.332.11:42547 -> 172.56.213.80:53 CREATE Ignore 0
2013-07-18 04:54:15.821 UDP 192.33.230.81:37192 192.81.130.82:37192 -> 172.81.123.70:53 CREATE Ignore 0
2013-07-18 04:54:15.811 TCP 172.12.332.11:42547 172.12.332.11:42547 -> 172.56.213.80:53 CREATE Ignore 0

First we read it into a data frame:

>>> df = pd.read_table('file.csv', sep=' ', header=None, parse_dates=[[0,1]])
>>> print df.to_string()
                         0_1    2                    3                    4   5                 6       7       8  9
0 2013-07-18 04:54:15.871000  UDP  172.12.332.11:20547  172.12.332.11:20547  ->  172.56.213.80:53  CREATE  Ignore  0
1 2013-07-18 04:54:15.841000  UDP  192.33.230.81:37192  192.81.130.82:37192  ->  172.81.123.70:53  CREATE  Ignore  0
2 2013-07-18 04:54:15.831000  TCP  172.12.332.11:42547  172.12.332.11:42547  ->  172.56.213.80:53  CREATE  Ignore  0
3 2013-07-18 04:54:15.821000  UDP  192.33.230.81:37192  192.81.130.82:37192  ->  172.81.123.70:53  CREATE  Ignore  0
4 2013-07-18 04:54:15.811000  TCP  172.12.332.11:42547  172.12.332.11:42547  ->  172.56.213.80:53  CREATE  Ignore  0

We need only the 0_1, 4 and 6 columns:

>>> df = df[['0_1', 4, 6]]
>>> print df.to_string()
                         0_1                    4                 6
0 2013-07-18 04:54:15.871000  172.12.332.11:20547  172.56.213.80:53
1 2013-07-18 04:54:15.841000  192.81.130.82:37192  172.81.123.70:53
2 2013-07-18 04:54:15.831000  172.12.332.11:42547  172.56.213.80:53
3 2013-07-18 04:54:15.821000  192.81.130.82:37192  172.81.123.70:53
4 2013-07-18 04:54:15.811000  172.12.332.11:42547  172.56.213.80:53

Then we should fix the IP addresses and remove ports:

>>> df[4] = df[4].str.split(':').str.get(0)
>>> df[6] = df[6].str.split(':').str.get(0)
>>> print df.to_string()
                         0_1              4              6
0 2013-07-18 04:54:15.871000  172.12.332.11  172.56.213.80
1 2013-07-18 04:54:15.841000  192.81.130.82  172.81.123.70
2 2013-07-18 04:54:15.831000  172.12.332.11  172.56.213.80
3 2013-07-18 04:54:15.821000  192.81.130.82  172.81.123.70
4 2013-07-18 04:54:15.811000  172.12.332.11  172.56.213.80

Let's say you are interested in the source address 172.12.332.11 and the destination address 172.56.213.80. We will keep only the rows that match both:

>>> filtered = df[(df[4] == '172.12.332.11') & (df[6] == '172.56.213.80')]
>>> print filtered.to_string()
                         0_1              4              6
0 2013-07-18 04:54:15.871000  172.12.332.11  172.56.213.80
2 2013-07-18 04:54:15.831000  172.12.332.11  172.56.213.80
4 2013-07-18 04:54:15.811000  172.12.332.11  172.56.213.80

Now we need to calculate the difference between the timestamps:

>>> timestamps = filtered['0_1']
>>> diffs = (timestamps.shift() - timestamps).dropna()
>>> print diffs.to_string()
2   00:00:00.040000
4   00:00:00.020000

And we can now calculate whatever statistics we want:

>>> diffs.mean() # this is in nanoseconds
30000000.0
>>> diffs.std()
14142135.62373095

Edit: For the data you sent me

import io
import pandas as pd

def load_dataframe(filename):
    """Load the ``_raw`` log column of a CSV file into a three-column frame.

    Each ``_raw`` value is expected to be one whitespace-separated record
    such as ``2013-07-18 04:54:15.871 UDP a:1 b:2 -> c:3 CREATE Ignore 0``.

    Returns a DataFrame with the columns ``'0_1'`` (timestamp built from the
    date and time fields), ``4`` (the address just before the arrow) and
    ``6`` (the address just after the arrow), both with the port removed.
    """
    # First read the data as a regular csv file and keep the _raw column.
    # Empty cells come back as NaN and cannot be parsed, so drop them.
    raw = pd.read_csv(filename)['_raw'].dropna()
    # A record may be wrapped over several lines inside its cell; replace
    # the newlines so every record sits on a single line.
    records = [value.replace('\n', ' ') for value in raw]
    # Add the records to an in-memory stream, one per line.
    stream = io.StringIO(u'\n'.join(records))
    # From here on everything is the same as reading a plain file. dtype=str
    # keeps the date and time fields as text so they can be joined below.
    df = pd.read_table(stream, sep=r'\s+', header=None, dtype=str)
    # Build the timestamp column explicitly: the nested-list form
    # parse_dates=[[0, 1]] is deprecated in current pandas releases.
    df['0_1'] = pd.to_datetime(df[0] + ' ' + df[1])
    # .copy() so the assignments below modify an independent frame.
    df = df[['0_1', 4, 6]].copy()
    # Keep only the IP part of "ip:port".
    df[4] = df[4].str.split(':').str.get(0)
    df[6] = df[6].str.split(':').str.get(0)
    return df

def get_diffs(df, source, destination):
    """Return the time gaps between consecutive rows matching the IP pair.

    Rows are selected where column ``4`` equals *source* and column ``6``
    equals *destination*. Each gap is the previous matching timestamp minus
    the current one; the first match has no predecessor and is dropped.
    """
    is_match = (df[4] == source) & (df[6] == destination)
    matched_times = df.loc[is_match, '0_1']
    previous_times = matched_times.shift()
    gaps = previous_times - matched_times
    return gaps.dropna()


def main():
    """Interactively report the time gaps between matching connections.

    Asks for a CSV file name once, then repeatedly asks for a source and a
    destination IP, prints the gap between each pair of consecutive matching
    records together with the mean and standard deviation of those gaps, and
    stops when the answer to "Again?" is anything other than ``y``.
    """
    # raw_input exists only on Python 2; Python 3 renamed it to input.
    try:
        ask = raw_input
    except NameError:
        ask = input

    filename = ask('Enter filename: ')
    df = load_dataframe(filename)
    while True:
        source = ask('Enter source IP: ').strip()
        destination = ask('Enter destination IP: ').strip()
        diffs = get_diffs(df, source, destination)
        if len(diffs) == 0:
            # Fewer than two matching records: there is no gap to measure.
            print('No time differences found for %s -> %s' % (source, destination))
        else:
            for i, gap in enumerate(diffs):
                # get_diffs subtracts each match from the one before it, so
                # gap number i is (match i+1) - (match i+2), counting from 1.
                # pd.Timedelta() normalises the element for printing.
                print('row %d - row %d = %s' % (i + 1, i + 2, pd.Timedelta(gap)))
            print('Mean: %s' % diffs.mean())
            print('Std: %s' % diffs.std())
        yn = ask('Again? [y/n]: ').lower().strip()
        if yn != 'y':
            return


if __name__ == '__main__':
    main()

Example usage:

$ python test.py
Enter filename: Data.csv
Enter source IP: 172.16.122.21
Enter destination IP: 172.55.102.107
Mean: 3333333.33333
Std: 5773502.6919
Again? [y/n]: n

Calculate Duration and Average of CSV file data using python

Question

1 answers

solution1
1 ACCEPTED 2013-08-12 23:21:38

Calculate Duration and Average of CSV file data using python

Question

1 answers

solution1 1 ACCEPTED 2013-08-12 23:21:38

solution1
1 ACCEPTED 2013-08-12 23:21:38