在Seaborn中绘制具有与“hue”类似的多个属性的图表

Question

I have the following sample data set called df, where each stage time is the number of days it took to reach that stage: 我有以下名为 df 的示例数据集，其中各阶段的时间（stage time）表示到达该阶段所用的天数：

id  stage_1_time  stage_1_to_2_time  stage_2_time  stage_2_to_3_time  stage_3_time
a   10            30                 40            30                 70
b   30
c   15            30                 45
d

I wrote the following script to get a scatter plot of stage1_time against a CDF: 我编写了以下脚本来获取stage1_time对CDF的散点图：

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

# Sample data: one row per id; None marks a stage that was never reached.
# The ids are spelled out explicitly (the bare name ``id`` is the builtin
# function, not data) and the mapping is not called ``dict`` so the builtin
# type stays usable.
data = {
    'id': ['a', 'b', 'c', 'd'],
    'stage_1_time': [10, 30, 15, None],
    'stage_1_to_2_time': [30, None, 30, None],
    'stage_2_time': [40, None, 45, None],
    'stage_2_to_3_time': [30, None, None, None],
    'stage_3_time': [70, None, None, None],
}
df = pd.DataFrame(data)

#create eCDF function
def ecdf(df):
    """Return the empirical CDF coordinates of the values in *df*.

    Parameters
    ----------
    df : array-like
        The observations: a list, a 1-D array, a Series or a single-column
        DataFrame.

    Returns
    -------
    x : numpy.ndarray
        The observations as a flat array, sorted in ascending order.
    y : numpy.ndarray
        The cumulative probability ``k / n`` for the k-th smallest
        observation (k = 1..n).
    """
    # Flatten before sorting: np.sort works along the last axis, so a
    # single-column frame of shape (n, 1) would otherwise come back unsorted.
    x = np.sort(np.asarray(df).ravel())
    n = x.size
    y = np.arange(1.0, n+1) / n
    return x, y

def generate_scatter_plot(df):
    """Draw the eCDF of the values in *df* on the current matplotlib axes.

    Three artists are added, in this order: the eCDF points, a dashed
    vertical line at the mean, and red diamond markers at the 0/25/50/75/100
    percentiles. The mean and every percentile marker are annotated with
    their (truncated) values.

    Parameters
    ----------
    df : array-like
        The observations (e.g. a single-column DataFrame).

    Returns
    -------
    None
        The function only draws; nothing is returned.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0. A flat ndarray works
    # on every version and gives percentileofscore the 1-D data it expects.
    values = np.asarray(df).ravel()

    x, y = ecdf(values)

    plt.plot(x, y, marker='.', linestyle='none')
    mean = x.mean()
    plt.axvline(mean, color='gray', linestyle='dashed', linewidth=2) #Add mean

    x_m = int(mean)
    y_m = stats.percentileofscore(values, mean)/100.0

    plt.annotate('(%s,%s)' % (x_m,int(y_m*100)) , xy=(x_m,y_m), xytext=(10,-5), textcoords='offset points')

    percentiles = np.array([0,25,50,75,100])
    x_p = np.percentile(values, percentiles)
    y_p = percentiles/100.0

    plt.plot(x_p, y_p, marker='D', color='red', linestyle='none') # Overlay quartiles

    # Distinct loop names so the eCDF arrays x / y are not overwritten.
    for px, py in zip(x_p, y_p):
        plt.annotate('%s' % int(px), xy=(px,py), xytext=(10,-5), textcoords='offset points')

#Data to plot
stage1_time = df['stage_1_time'].dropna().sort_values()

#Scatter Plot
# Series.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
# generate_scatter_plot only draws, so the name bound here holds None.
stage1_time_scatter = generate_scatter_plot(pd.DataFrame({"df" : stage1_time.to_numpy()}))
plt.title('Scatter Plot of Days to Stage1')
plt.xlabel('Days to Stage1')
plt.ylabel('Cumulative Probability')
# Labels are matched to the drawn artists in order: points, mean line, quartiles.
plt.legend(('Days to Stage1', "Mean", 'Quartiles'), loc='lower right')
plt.margins(0.02)

plt.show()

Output: 输出：

Currently I have the number of days it took everyone who reached stage1 plotted against its cumulative probability; however, what I am trying to achieve is a scatter plot with three colors: those who reached stage1 and stayed there, those who moved on to stage2, and those who moved on to stage3. 目前，我已将所有到达 stage1 的人所用的天数与其累积概率绘制成图；但我想要实现的是让散点图显示三种颜色：到达 stage1 并停留在那里的人、进入 stage2 的人，以及进入 stage3 的人。 I would also like the counts for the data in the graph: # in stage1, # in stage2 and # in stage3. 我还希望在图中显示各组数据的计数：stage1 的数量、stage2 的数量和 stage3 的数量。

Can anyone assist with getting there please? 有人可以帮忙到那里吗？

FYI, the intention is to use this as a base so that I can also create a graph for stage2_time, where those reaching stage_3 are highlighted in a different color. 仅供参考，我打算以此为基础，以便还能为 stage2_time 创建一个图表，其中到达 stage_3 的人用不同的颜色突出显示。

Answer 1

You can create a new column and use it to store the final stage, then use this new column to color your plot. 您可以创建一个新列并使用它来存储最终阶段，然后使用此新列为您的绘图着色。

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import math

# Sample data: one row per id; None marks a stage that was never reached.
# Column order matters: the plotting code reads the per-stage arrival times
# by position (columns 1, 3, 5), and the driver script selects the series by
# the name 'stage_1_time', so the columns carry the stage_N_time names.
# The ids are spelled out explicitly (the bare name ``id`` is the builtin
# function, not data) and the mapping is not called ``dict`` so the builtin
# type stays usable.
data = {
    'id': ['a', 'b', 'c', 'd'],
    'stage_1_time': [10, 30, 15, None],
    'stage_1_to_2_time': [30, None, 30, None],
    'stage_2_time': [40, None, 45, None],
    'stage_2_to_3_time': [30, None, None, None],
    'stage_3_time': [70, None, None, None],
}
df = pd.DataFrame(data)

    #create eCDF function
def ecdf(df, serie):
    """Return a copy of *df* ordered by *serie* with eCDF columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    serie : str
        Name of the column whose empirical CDF is computed.

    Returns
    -------
    pandas.DataFrame
        The rows of *df* sorted by *serie*, plus two columns: ``x`` (the
        sorted values of *serie*) and ``y`` (the cumulative probability
        ``k / n`` of the k-th smallest value).
    """
    n = len(df)
    # Sort whole rows rather than only the values, so each x stays on the row
    # it came from (and keeps that row's other columns, e.g. a stage label).
    # sort_values returns a new frame, leaving the caller's frame untouched.
    out = df.sort_values(by=serie)
    out['x'] = out[serie].to_numpy()
    out['y'] = np.arange(1.0, n+1) / n
    return out

def generate_scatter_plot(df,serie,nb_stage):
    """Draw the eCDF of column *serie*, coloured by the last stage reached.

    Artists are added to the current matplotlib axes in this order: one
    scatter per stage (1..nb_stage), a dashed vertical line at the mean, and
    red diamond markers at the 0/25/50/75/100 percentiles. The mean and every
    percentile marker are annotated with their (truncated) values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Assumes the arrival time of stage k is stored in column
        position ``2*k - 1`` (id, stage 1, 1->2, stage 2, 2->3, stage 3, ...).
    serie : str
        Name of the column to plot; rows where it is missing are dropped.
    nb_stage : int
        Number of stages to look for and to draw.

    Returns
    -------
    None
        The function only draws; nothing is returned.
    """
    df = df.dropna(subset=[serie]).sort_values(by=[serie])

    # Later stages overwrite earlier ones, so 'stage' ends up holding the
    # furthest stage for which the row has an arrival time.
    for stage, position in enumerate(range(1, nb_stage*2, 2), start=1):
        df.loc[df.iloc[:, position].notnull(), 'stage'] = stage

    df = ecdf(df, serie)

    # One scatter per stage, driven by nb_stage instead of three hard-coded
    # calls; the colours repeat if there are more stages than colours.
    palette = ('blue', 'red', 'green')
    for stage in range(1, nb_stage + 1):
        reached = df['stage'] == stage
        plt.plot(df.loc[reached, 'x'], df.loc[reached, 'y'], marker='.', linestyle='none', c=palette[(stage - 1) % len(palette)])

    mean = df['x'].mean()
    plt.axvline(mean, color='gray', linestyle='dashed', linewidth=2) #Add mean

    x_m = int(mean)
    y_m = stats.percentileofscore(df[serie], mean)/100.0

    plt.annotate('(%s,%s)' % (x_m,int(y_m*100)) , xy=(x_m,y_m), xytext=(10,-5), textcoords='offset points')

    percentiles = np.array([0,25,50,75,100])
    x_p = np.percentile(df[serie], percentiles)
    y_p = percentiles/100.0

    plt.plot(x_p, y_p, marker='D', color='red', linestyle='none') # Overlay quartiles

    for px, py in zip(x_p, y_p):
        plt.annotate('%s' % int(px), xy=(px,py), xytext=(10,-5), textcoords='offset points')

#Scatter Plot
# generate_scatter_plot only draws, so the name bound here holds None.
stage1_time_scatter = generate_scatter_plot(df,'stage_1_time',3)
plt.title('Scatter Plot of Days to Stage1')
plt.xlabel('Days to Stage1')
plt.ylabel('Cumulative Probability')
# Exactly one label per drawn artist, in drawing order: the three stage
# scatters, then the mean line, then the quartile markers. A sixth label
# would shift 'Mean' onto the quartiles and drop 'Quartiles' altogether.
plt.legend(('Progressive','Active','Engaged', "Mean", 'Quartiles'), loc='lower right')
plt.margins(0.02)

plt.show()

在Seaborn中绘制具有与“hue”类似的多个属性的图表

问题描述

1 个解决方案

解决方案1
5 已采纳 2018-06-04 12:22:50

在Seaborn中绘制具有与“hue”类似的多个属性的图表

问题描述

1 个解决方案

解决方案1 5 已采纳 2018-06-04 12:22:50

解决方案1
5 已采纳 2018-06-04 12:22:50