[英]Pandas pivot table sorting with multiple indexes
輸入代碼:
import pandas as pd
import numpy as np
# Dummy df:
df = pd.DataFrame(
    {
        "Name": ["John", "Boby", "Mina", "Peter", "Nicky", "Peter", "Mina", "Peter"],
        "City": ["London", "NY", "LA", "London", "NY", "HK", "NY", "HK"],
        "Stage": [
            "Masters", "Graduate", "Graduate", "Masters",
            "Graduate", "Masters", "Graduate", "Graduate",
        ],
        "Year": [2020, 2019, 2020, 2019, 2020, 2019, 2020, 2020],
        "Month": [202001, 201902, 202003, 201904, 202005, 201902, 202007, 202012],
        "Earnings": [27, 23, 21, 66, 24, 22, 34, 65],
    }
)

# Earnings per (Name, City, Stage) row and (Year, Month) column, with an
# "All" margin, ordered by each row's grand total.
# aggfunc is given as the string "sum": passing the np.sum callable emits a
# FutureWarning on recent pandas releases and yields the same numbers.
# NOTE: this is a flat sort on the row totals only; it does not keep the rows
# of one Name (or one Name/City pair) together.
df_pivot = pd.pivot_table(
    df,
    values="Earnings",
    index=["Name", "City", "Stage"],
    columns=["Year", "Month"],
    aggfunc="sum",
    fill_value=0,
    margins=True,
).sort_values("All", ascending=False)
print(df_pivot)
Output pivot表:
Year 2019 2020 All
Month 201902 201904 202001 202003 202005 202007 202012
Name City Stage
All 45 66 27 21 24 34 65 282
Peter London Masters 0 66 0 0 0 0 0 66
HK Graduate 0 0 0 0 0 0 65 65
Mina NY Graduate 0 0 0 0 0 34 0 34
John London Masters 0 0 27 0 0 0 0 27
Nicky NY Graduate 0 0 0 0 24 0 0 24
Boby NY Graduate 23 0 0 0 0 0 0 23
Peter HK Masters 22 0 0 0 0 0 0 22
Mina LA Graduate 0 0 0 21 0 0 0 21
所需的 output 首先按第一列排序,然后在組內按第二列排序,最后在組內按第三列排序:
Year 2019 2020 All
Month 201902 201904 202001 202003 202005 202007 202012
Name City Stage
All 45 66 27 21 24 34 65 282
Peter HK Graduate 0 0 0 0 0 0 65 65
Masters 22 0 0 0 0 0 0 22
London Masters 0 66 0 0 0 0 0 66
Mina NY Graduate 0 0 0 0 0 34 0 34
LA Graduate 0 0 0 21 0 0 0 21
John London Masters 0 0 27 0 0 0 0 27
Nicky NY Graduate 0 0 0 0 24 0 0 24
Boby NY Graduate 23 0 0 0 0 0 0 23
請注意 Peter-HK 如何高於 Peter-London,因為 Peter-HK 的總和 (65+22) > Peter-London 的總和 (66)。
換句話說:首先給我最大總數的名稱,然后在該名稱中給我最大總數的城市,然后在該名稱和城市中給我最大總數的階段。
謝謝帕維爾
在更好地理解問題后進行編輯。
您想先按每個人(由 `Name` 定義)獲得的最高分數進行排序,然后在每個人的分組內,再按其各項分數進行排序。
在您的示例中,我可以通過以下方式得到按所需 `Name` 順序排列的列表:
import pandas as pd
import numpy as np
# Dummy df (same sample data as the question, including the 66 earned by
# Peter in London, so the "HK total beats London total" case is exercised):
df = pd.DataFrame(
    {
        "Name": ["John", "Boby", "Mina", "Peter", "Nicky", "Peter", "Mina", "Peter"],
        "City": ["London", "NY", "LA", "London", "NY", "HK", "NY", "HK"],
        "Stage": [
            "Masters", "Graduate", "Graduate", "Masters",
            "Graduate", "Masters", "Graduate", "Graduate",
        ],
        "Year": [2020, 2019, 2020, 2019, 2020, 2019, 2020, 2020],
        "Month": [202001, 201902, 202003, 201904, 202005, 201902, 202007, 202012],
        "Earnings": [27, 23, 21, 66, 24, 22, 34, 65],
    }
)

# Make the pivot table.
# aggfunc is the string "sum": passing the np.sum callable emits a
# FutureWarning on recent pandas releases and yields the same numbers.
df_pivot = pd.pivot_table(
    df,
    values="Earnings",
    index=["Name", "City", "Stage"],
    columns=["Year", "Month"],
    aggfunc="sum",
    fill_value=0,
    margins=True,
).sort_values("All", ascending=False)
print('Original table')
print(df_pivot)
def sort_groups(df, group_by_col, sort_by_col, F_asc, agg="sum"):
    """Reorder the labels of one MultiIndex level by an aggregated value.

    The frame is grouped on the index level ``group_by_col``, ``agg`` is
    applied per group, the groups are ranked on ``sort_by_col`` and the frame
    is re-indexed on that level in the resulting label order.

    Args:
        df (pd.DataFrame): Dataframe to sort; its row index must contain a
            level named ``group_by_col``.
        group_by_col (str): Name of the index level whose labels are reordered.
        sort_by_col (str): Name of the value column the groups are ranked on.
        F_asc (bool): Ascending sort - True/False.
        agg (str): Aggregation applied to each group before ranking. Defaults
            to ``"sum"`` so groups are ranked by their total; pass ``"max"``
            to rank groups by their single largest row instead.

    Returns:
        pd.DataFrame: ``df`` re-indexed on the given MultiIndex level.
    """
    # One row per label of the chosen level, holding the aggregated values.
    group_scores = df.groupby(level=group_by_col).agg(agg)
    # Labels of that level in the order the groups should appear.
    ordered_labels = group_scores.sort_values(
        sort_by_col, ascending=F_asc
    ).index.tolist()
    # reindex() wants the level's position, not its name.
    level_position = df.index.names.index(group_by_col)
    return df.reindex(ordered_labels, level=level_position)
# First level sorting: Name
df_pivot_1 = sort_groups(df_pivot, 'Name', 'All', False)
print('\nSort groups at name level:')
print(df_pivot_1)

# Second level sorting: City, applied separately inside every Name.
# sort=False makes groupby yield the groups in their order of appearance;
# the default (sort=True) yields them in sorted key order, which would undo
# the Name ordering established above once the pieces are concatenated.
df_pivot_2 = pd.concat(
    [
        sort_groups(group, 'City', 'All', False)
        for _, group in df_pivot_1.groupby(level='Name', sort=False)
    ]
)
print('\nSort groups at city level:')
print(df_pivot_2)

# Third level sorting: Stage, applied separately inside every (Name, City).
# Built with concat like the step above rather than groupby().apply(), so the
# row index keeps exactly the three levels Name/City/Stage and the group
# order from the previous step is preserved.
df_pivot_3 = pd.concat(
    [
        sort_groups(group, 'Stage', 'All', False)
        for _, group in df_pivot_2.groupby(level=['Name', 'City'], sort=False)
    ]
)
print('\nSort groups at stage level:')
print(df_pivot_3)
但是,此解決方案不會將 `All` 行放置在您指定的位置。這一點對您來說是硬性要求嗎?
問候,
簡
這是一種將 groupby 與 pivot 結合起來的超級干凈的方法
# Same sample data as the question (Peter earned 66 in London).
df = pd.DataFrame(
    {
        "Name": ["John", "Boby", "Mina", "Peter", "Nicky", "Peter", "Mina", "Peter"],
        "City": ["London", "NY", "LA", "London", "NY", "HK", "NY", "HK"],
        "Stage": [
            "Masters", "Graduate", "Graduate", "Masters",
            "Graduate", "Masters", "Graduate", "Graduate",
        ],
        "Year": [2020, 2019, 2020, 2019, 2020, 2019, 2020, 2020],
        "Month": [202001, 201902, 202003, 201904, 202005, 201902, 202007, 202012],
        "Earnings": [27, 23, 21, 66, 24, 22, 34, 65],
    }
)

# One row per (Name, City, Stage, Year, Month) holding the summed earnings.
grouped = df.groupby(["Name", "City", "Stage", "Year", "Month"])["Earnings"].sum()
grouped = grouped.reset_index(name="Sum")

# Wide table: rows are (Name, City, Stage), columns are (Year, Month).
fp = grouped.pivot(
    index=["Name", "City", "Stage"], columns=["Year", "Month"], values="Sum"
).fillna(0)

# Totals at each nesting depth. transform("sum") returns one value per
# original row, so the results line up with fp's three-level row index.
row_totals = fp.sum(axis="columns")
city_totals = row_totals.groupby(level=["Name", "City"]).transform("sum")
name_totals = row_totals.groupby(level="Name").transform("sum")

fp["Totals"] = row_totals
fp["Rank"] = city_totals

# Order rows by: total of the Name, then total of the (Name, City) pair, then
# the row's own total - all descending. The Name and City labels are included
# as keys so that groups with equal totals still stay contiguous.
sort_keys = pd.DataFrame(
    {"name_total": name_totals, "city_total": city_totals, "row_total": row_totals}
).reset_index()
row_order = sort_keys.sort_values(
    by=["name_total", "Name", "city_total", "City", "row_total"],
    ascending=False,
).index.to_numpy()
# After reset_index() the key frame has a 0..n-1 index, so the sorted index
# values are row positions into fp.
fp = fp.iloc[row_order]
print(fp)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.