简体   繁体   中英

Plot graph with multiple attributes similar to “hue” in Seaborn

I have the following sample data set called df, where each stage time is the number of days it took to reach that stage:

id stage_1_time stage_1_to_2_time stage_2_time stage_2_to_3_time stage_3_time
a  10           30                40           30                70
b  30
c  15           30                45
d

I wrote the following script to draw a scatter plot of stage_1_time against its empirical cumulative distribution function (eCDF):

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

# Sample data matching the table above: one row per id; None marks a stage
# (or transition) that the id never reached.
# The ids are spelled out explicitly (the builtin function ``id`` is not a
# column of ids) and the mapping is not called ``dict`` so the builtin type
# stays usable.
data = {
    'id': ['a', 'b', 'c', 'd'],
    'stage_1_time': [10, 30, 15, None],
    'stage_1_to_2_time': [30, None, 30, None],
    'stage_2_time': [40, None, 45, None],
    'stage_2_to_3_time': [30, None, None, None],
    'stage_3_time': [70, None, None, None],
}
df = pd.DataFrame(data)

#create eCDF function
def ecdf(df):
    """Return the empirical CDF of the values in *df* as ``(x, y)``.

    ``df`` may be any array-like: a list, a 1-D array, a Series or a
    single-column DataFrame. The values are flattened to one dimension
    before sorting, because ``np.sort`` sorts along the last axis and would
    otherwise leave an ``(n, 1)`` column in its original order.

    Returns:
        x: the values sorted in ascending order (1-D array of length n).
        y: the cumulative probabilities ``1/n, 2/n, ..., 1`` for those values.
    """
    x = np.sort(np.asarray(df).ravel())
    n = x.size
    y = np.arange(1, n + 1) / n
    return x, y

def generate_scatter_plot(df):
    """Draw the eCDF of the values in *df* with mean and quartile markers.

    Plots on the current pyplot figure: the eCDF points, a dashed vertical
    line at the mean annotated with ``(mean, percentile of the mean)``, and
    red diamonds at the 0/25/50/75/100th percentiles, each annotated with
    its value. Returns None.

    Args:
        df: the values to plot (array-like; the caller passes a
            single-column DataFrame).
    """
    # Work on a flat NumPy array. DataFrame.as_matrix() no longer exists in
    # current pandas, and a flat array is what the statistics below expect.
    values = np.asarray(df).ravel()

    x, y = ecdf(values)
    mean = x.mean()

    plt.plot(x, y, marker='.', linestyle='none')
    plt.axvline(mean, color='gray', linestyle='dashed', linewidth=2)  # Add mean

    # Annotate the mean with its (truncated) value and its percentile rank.
    x_m = int(mean)
    y_m = stats.percentileofscore(values, mean) / 100.0

    plt.annotate('(%s,%s)' % (x_m, int(y_m * 100)), xy=(x_m, y_m), xytext=(10, -5), textcoords='offset points')

    percentiles = np.array([0, 25, 50, 75, 100])
    x_p = np.percentile(values, percentiles)
    y_p = percentiles / 100.0

    plt.plot(x_p, y_p, marker='D', color='red', linestyle='none')  # Overlay quartiles

    # Separate loop names so the eCDF arrays x and y are not overwritten.
    for x_q, y_q in zip(x_p, y_p):
        plt.annotate('%s' % int(x_q), xy=(x_q, y_q), xytext=(10, -5), textcoords='offset points')

# Data to plot: stage-1 times of everyone who reached stage 1, ascending.
stage1_time = df['stage_1_time'].dropna().sort_values()

# Scatter plot. Series.to_numpy() replaces Series.as_matrix(), which has been
# removed from pandas. generate_scatter_plot only draws on the current figure
# (it has no return statement), so stage1_time_scatter is None.
stage1_time_scatter = generate_scatter_plot(pd.DataFrame({"df" : stage1_time.to_numpy()}))
plt.title('Scatter Plot of Days to Stage1')
plt.xlabel('Days to Stage1')
plt.ylabel('Cumulative Probability')
plt.legend(('Days to Stage1', "Mean", 'Quartiles'), loc='lower right')
plt.margins(0.02)

plt.show()

Output:

[image: enter image description here]

Currently, the number of days taken by everyone who reached stage 1 is plotted against its cumulative probability. What I am trying to achieve is a scatter plot with three colors: those who reached stage 1 and stayed there, those who moved on to stage 2, and those who moved on to stage 3. I would also like the graph to show the counts: the number in stage 1, the number in stage 2 and the number in stage 3.

Can anyone assist with getting there please?

FYI, the intention is to use this as a base so that I can also create a graph for stage_2_time, where those who reached stage 3 are highlighted in a different color.

You can create a new column and use it to store the final stage, then use this new column to color your plot.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import math

# Sample data: one row per id; None marks a stage (or transition) that the id
# never reached. Column order matters: the stage-time columns sit at the odd
# positions 1, 3 and 5.
# The ids are spelled out explicitly (the builtin function ``id`` is not a
# column of ids) and the mapping is not called ``dict`` so the builtin type
# stays usable.
data = {
    'id': ['a', 'b', 'c', 'd'],
    'Progressive_time': [10, 30, 15, None],
    'stage_1_to_2_time': [30, None, 30, None],
    'Active_time': [40, None, 45, None],
    'stage_2_to_3_time': [30, None, None, None],
    'Engaged_time': [70, None, None, None],
}
df = pd.DataFrame(data)

# Create the eCDF function
def ecdf(df, serie):
    """Attach empirical-CDF coordinates for column *serie* to *df*.

    Two columns are written on *df* itself (created or overwritten):
    ``'x'`` holds the values of ``df[serie]`` sorted in ascending order and
    ``'y'`` holds the cumulative probabilities ``1/n, 2/n, ..., 1``.

    The sorted values are stored row by row in sorted order, so ``'x'``
    lines up with the other columns of a row only when *df* is already
    ordered by *serie*.

    Returns:
        The same *df* object that was passed in.
    """
    count = len(df)
    sorted_values = np.sort(df[serie])
    ranks = np.arange(1, count + 1)
    df['x'] = sorted_values
    df['y'] = ranks / count
    return df

def generate_scatter_plot(df,serie,nb_stage):
    """Plot the eCDF of column *serie*, coloured by the last stage reached.

    Rows with a null *serie* value are dropped and the rest are sorted by
    *serie*. Each remaining row is tagged with a stage number: the stage
    columns are read by position (columns 1, 3, 5, ... up to *nb_stage* of
    them) and later stages overwrite earlier ones, so the tag is the highest
    stage whose column is non-null. One series per stage is drawn on the
    current pyplot figure, followed by a dashed line at the mean (annotated
    with its value and percentile rank) and red diamonds at the
    0/25/50/75/100th percentiles. Returns None.

    Args:
        df: frame holding the stage-time columns at odd column positions.
        serie: name of the column whose values are plotted.
        nb_stage: number of stage columns to inspect and of series to draw.

    Raises:
        ValueError: if *serie* has no non-null values.
    """
    # The first three colours match the original fixed palette; the rest are
    # used (and then cycled) when nb_stage is larger than three.
    stage_colors = ('blue', 'red', 'green', 'orange', 'purple', 'brown')

    # Explicit copy: the helper columns are added to a private frame, never
    # to the caller's data.
    df = df.dropna(subset=[serie]).sort_values(by=[serie]).copy()
    if df.empty:
        raise ValueError('no non-null values in column %r' % serie)

    # Build the stage tags before adding any column, so the positional
    # lookups below only ever see the caller's columns.
    stage = np.full(len(df), np.nan)
    for st, col in enumerate(range(1, nb_stage * 2, 2), start=1):
        stage[df.iloc[:, col].notnull().to_numpy()] = st
    df['stage'] = stage

    df = ecdf(df, serie)

    # One series per stage, always drawn (even when empty) so that the
    # number and order of artists is predictable for the caller's legend.
    for st in range(1, nb_stage + 1):
        in_stage = df['stage'] == st
        plt.plot(df.loc[in_stage, 'x'], df.loc[in_stage, 'y'],
                 marker='.', linestyle='none',
                 color=stage_colors[(st - 1) % len(stage_colors)],
                 label='Stage %d (n=%d)' % (st, in_stage.sum()))

    mean = df['x'].mean()
    plt.axvline(mean, color='gray', linestyle='dashed', linewidth=2, label='Mean')  # Add mean

    # Annotate the mean with its (truncated) value and its percentile rank.
    x_m = int(mean)
    y_m = stats.percentileofscore(df[serie], mean) / 100.0

    plt.annotate('(%s,%s)' % (x_m, int(y_m * 100)), xy=(x_m, y_m), xytext=(10, -5), textcoords='offset points')

    percentiles = np.array([0, 25, 50, 75, 100])
    x_p = np.percentile(df[serie], percentiles)
    y_p = percentiles / 100.0

    plt.plot(x_p, y_p, marker='D', color='red', linestyle='none', label='Quartiles')  # Overlay quartiles

    for x_q, y_q in zip(x_p, y_p):
        plt.annotate('%s' % int(x_q), xy=(x_q, y_q), xytext=(10, -5), textcoords='offset points')

#Scatter Plot
# The sample frame names its stage columns Progressive/Active/Engaged, so the
# first-stage series is 'Progressive_time' ('stage_1_time' is not a column of
# this frame and raises KeyError).
stage1_time_scatter = generate_scatter_plot(df,'Progressive_time',3)
plt.title('Scatter Plot of Days to Stage1')
plt.xlabel('Days to Stage1')
plt.ylabel('Cumulative Probability')
# One label per drawn artist, in drawing order: the three stage series, the
# mean line, the quartile markers. An extra label would shift 'Mean' and
# 'Quartiles' onto the wrong artists.
plt.legend(('Progressive','Active','Engaged', "Mean", 'Quartiles'), loc='lower right')
plt.margins(0.02)

plt.show()

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM