[英]Plot graph with multiple attributes similar to “hue” in Seaborn
我有以下示例數據集,名為df
,其中舞台時間是到達df
天數:
id stage1_time stage_1_to_2_time stage_2_time stage_2_to_3_time stage3_time
a 10 30 40 30 70
b 30
c 15 30 45
d
我編寫了以下腳本來獲取stage1_time
對CDF的散點圖:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
dict = {'id': id, 'stage_1_time': [10, 30, 15, None], 'stage_1_to_2_time': [30, None, 30, None], 'stage_2_time' : [40, None, 45, None],'stage_2_to_3_time' : [30, None, None, None],'stage_3_time' : [70, None, None, None]}
df = pd.DataFrame(dict)
#create eCDF function
def ecdf(df):
n = len(df)
x = np.sort(df)
y = np.arange(1.0, n+1) / n
return x, y
def generate_scatter_plot(df):
x, y = ecdf(df)
plt.plot(x, y, marker='.', linestyle='none')
plt.axvline(x.mean(), color='gray', linestyle='dashed', linewidth=2) #Add mean
x_m = int(x.mean())
y_m = stats.percentileofscore(df.as_matrix(), x.mean())/100.0
plt.annotate('(%s,%s)' % (x_m,int(y_m*100)) , xy=(x_m,y_m), xytext=(10,-5), textcoords='offset points')
percentiles= np.array([0,25,50,75,100])
x_p = np.percentile(df, percentiles)
y_p = percentiles/100.0
plt.plot(x_p, y_p, marker='D', color='red', linestyle='none') # Overlay quartiles
for x,y in zip(x_p, y_p):
plt.annotate('%s' % int(x), xy=(x,y), xytext=(10,-5), textcoords='offset points')
#Data to plot
stage1_time = df['stage_1_time'].dropna().sort_values()
#Scatter Plot
stage1_time_scatter = generate_scatter_plot(pd.DataFrame({"df" : stage1_time.as_matrix()}))
plt.title('Scatter Plot of Days to Stage1')
plt.xlabel('Days to Stage1')
plt.ylabel('Cumulative Probability')
plt.legend(('Days to Stage1', "Mean", 'Quartiles'), loc='lower right')
plt.margins(0.02)
plt.show()
輸出:
目前,所有達到stage1
根據其累積概率繪制的天數,但是我想要實現的是當我繪制時散布有三種顏色:那些到達stage1
並留在那里的人,那些進入stage2
和那些誰轉移到stage3
。 我也想在圖表中的數據計數:#在stage1
中,# stage2
中和# stage3
。
有人可以幫忙到那里嗎?
僅供參考,意圖是使用它作為基礎,這樣我也可以為stage2_time
創建一個圖形,其中到達stage_3
的圖形突出顯示不同的顏色。
您可以創建一個新列並使用它來存儲最終階段,然后使用此新列為您的繪圖着色。
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import math
dict = {'id': id, 'Progressive_time': [10, 30, 15, None],'stage_1_to_2_time': [30, None, 30, None], 'Active_time' : [40,None, 45, None],'stage_2_to_3_time' : [30, None, None,None],'Engaged_time' : [70, None, None, None]}
df = pd.DataFrame(dict)
#create eCDF function
def ecdf(df, serie):
n = len(df)
df['x'] = np.sort(df[serie])
df['y'] = np.arange(1.0, n+1) / n
return df
def generate_scatter_plot(df,serie,nb_stage):
df=df.dropna(subset=[serie]).sort_values(by=[serie])
st=1
for i in range(1,nb_stage*2,2):
df.loc[df.iloc[:,i].notnull(),'stage']=st
st=st+1
df= ecdf(df, serie)
plt.plot(df.loc[df['stage'] == 1, 'x'], df.loc[df['stage'] == 1, 'y'], marker='.', linestyle='none',c='blue')
plt.plot(df.loc[df['stage'] == 2, 'x'], df.loc[df['stage'] == 2, 'y'], marker='.', linestyle='none',c='red')
plt.plot(df.loc[df['stage'] == 3, 'x'], df.loc[df['stage'] == 3, 'y'], marker='.', linestyle='none',c='green')
plt.axvline(df['x'].mean(), color='gray', linestyle='dashed', linewidth=2) #Add mean
x_m = int(df['x'].mean())
y_m = stats.percentileofscore(df[serie], df['x'].mean())/100.0
plt.annotate('(%s,%s)' % (x_m,int(y_m*100)) , xy=(x_m,y_m), xytext=(10,-5), textcoords='offset points')
percentiles= np.array([0,25,50,75,100])
x_p = np.percentile(df[serie], percentiles)
y_p = percentiles/100.0
plt.plot(x_p, y_p, marker='D', color='red', linestyle='none') # Overlay quartiles
for x,y in zip(x_p, y_p):
plt.annotate('%s' % int(x), xy=(x,y), xytext=(10,-5), textcoords='offset points')
#Scatter Plot
stage1_time_scatter = generate_scatter_plot(df,'stage_1_time',3)
plt.title('Scatter Plot of Days to Stage1')
plt.xlabel('Days to Stage1')
plt.ylabel('Cumulative Probability')
plt.legend(('Progressive','Active','Engaged','Days to Stage1', "Mean", 'Quartiles'), loc='lower right')
plt.margins(0.02)
plt.show()
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.