[英]Creating new pandas dataframe based on another dataframe
編輯:
pandas 1.0.5有bug,升級到1.1.1后就沒有報錯了。
我有一個 pandas dataframe 看起來像這樣:
Name Date Price Label Y Z
foo1 1/1/20 100 1 _ _
foo1 1/1/20 200 2 _ _
. . . . . .
. . . . . .
foo1 1/8/20 240 1 _ _
foo2 1/2/20 500 1 _ _
. . . . . .
. . . . . .
foo2 1/7/20 423 4 _ _
. . . . . .
. . . . . .
Name
列有 80 個唯一值,即 foo1 - foo80Date
值Label
值我想創建一個表,它有 80 行(每個名稱對應一個名稱)和 20*4 + 1 列(每個日期標簽組合為 20x4,名稱為 1)。
最終的 dataframe 應該如下所示:
**Name 1/1/20(Label1) 1/1/20(Label2) 1/1/20(Label3) 1/1/20(Label4) 1/2/20(Label1) ... 4/7/20(Label4)**
foo1 100 200 300 -1 -1 -1
foo2 -1 -1 -1 -1 500 -1
...............
...............
-1 表示原始條目中沒有特定名稱-日期-標簽組合的條目。
我基本上是 pandas 的新手,我當然可以手動迭代地構建 dataframe(if..else 解決方案),但我相信有一個更快、可讀且更簡單的解決方案。
> df.columns
Index(['A', 'B', 'Date', 'C',
'D', 'Price', 'Label', 'E',
'Name', 'F', 'G', 'H', 'I',
'J'],
dtype='object')
> df.head(10).to_dict('list')
{'A': [160, 457, 457, 482, 482, 482, 482, 423, 223, 506],
'B': ['8/27/2015 0:00',
'10/15/2015 0:00',
'10/15/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'9/29/2015 0:00',
'9/9/2015 0:00',
'11/9/2015 0:00'],
'Date': ['8/28/2015 0:00',
'10/16/2015 0:00',
'10/16/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'9/30/2015 0:00',
'9/10/2015 0:00',
'11/10/2015 0:00'],
'C': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
'D': [1271, 1825, 1825, 1455, 1455, 1455, 1455, 2522, 1385, 1765],
'Price': [1058, 1685, 1615, 1195, 1255, 1279, 1295, 2285, 1285, 1665],
'Label': [3, 3, 2, 1, 3, 4, 2, 2, 1, 4],
'E': [13, 127, 127, -1, -1, -1, -1, -1, -1, -1],
'Name': ['foo1',
'foo2',
'foo2',
'foo3',
'foo3',
'foo3',
'foo3',
'foo4',
'foo4',
'foo3'],
'F': [4, 4, 4, 3, 3, 3, 3, 3, 3, 3],
'G': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
'H': ['Friday',
'Friday',
'Friday',
'Thursday',
'Thursday',
'Thursday',
'Thursday',
'Wednesday',
'Thursday',
'Tuesday'],
'I': [213, 140, 210, 260, 200, 176, 160, 237, 100, 100],
'J': [16.758457907159716,
7.671232876712329,
11.506849315068493,
17.869415807560138,
13.745704467353955,
12.096219931271474,
10.996563573883162,
9.397303727200637,
7.220216606498194,
5.6657223796034]}
使用
import pandas_datareader.data as web
import pandas as pd
import numpy as np
from datetime import datetime
df = pd.DataFrame({
# 'A': [160, 457, 457, 482, 482, 482, 482, 423, 223, 506],
# 'B': ['8/27/2015 0:00','10/15/2015 0:00','10/15/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','9/29/2015 0:00','9/9/2015 0:00','11/9/2015 0:00'],
'Date': ['8/28/2015 0:00','10/16/2015 0:00','10/16/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','9/30/2015 0:00','9/10/2015 0:00','11/10/2015 0:00'],
# 'C': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
# 'D': [1271, 1825, 1825, 1455, 1455, 1455, 1455, 2522, 1385, 1765],
'Price': [1058, 1685, 1615, 1195, 1255, 1279, 1295, 2285, 1285, 1665],
'Label': [3, 3, 2, 1, 3, 4, 2, 2, 1, 4],
# 'E': [13, 127, 127, -1, -1, -1, -1, -1, -1, -1],
'Name': ['foo1','foo2','foo2','foo3','foo3','foo3','foo3','foo4','foo4','foo3'],
# 'F': [4, 4, 4, 3, 3, 3, 3, 3, 3, 3],
# 'G': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# 'H': ['Friday','Friday','Friday','Thursday','Thursday','Thursday','Thursday','Wednesday','Thursday','Tuesday'],
# 'I': [213, 140, 210, 260, 200, 176, 160, 237, 100, 100],
# 'J': [16.758457907159716,7.671232876712329,11.506849315068493,17.869415807560138,13.745704467353955,12.096219931271474,10.996563573883162,9.397303727200637,7.220216606498194,5.6657223796034]
})
df.pivot(index='Name', columns=['Date', 'Label'], values='Price')`
我得到
ValueError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\indexes\multi.py in _get_level_number(self, level)
1294 try:
-> 1295 level = self.names.index(level)
1296 except ValueError:
ValueError: 'Date' is not in list
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-17-542e5c02777d> in <module>
----> 1 df.pivot(index='Name', columns=['Date', 'Label'], values='Price')
~\anaconda3\lib\site-packages\pandas\core\frame.py in pivot(self, index, columns, values)
5921 from pandas.core.reshape.pivot import pivot
5922
-> 5923 return pivot(self, index=index, columns=columns, values=values)
5924
5925 _shared_docs[
~\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in pivot(data, index, columns, values)
448 else:
449 indexed = data._constructor_sliced(data[values].values, index=index)
--> 450 return indexed.unstack(columns)
451
452
~\anaconda3\lib\site-packages\pandas\core\series.py in unstack(self, level, fill_value)
3548 from pandas.core.reshape.reshape import unstack
3549
-> 3550 return unstack(self, level, fill_value)
3551
3552 # ----------------------------------------------------------------------
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in unstack(obj, level, fill_value)
396 # _unstack_multiple only handles MultiIndexes,
397 # and isn't needed for a single level
--> 398 return _unstack_multiple(obj, level, fill_value=fill_value)
399 else:
400 level = level[0]
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in _unstack_multiple(data, clocs, fill_value)
318 index = data.index
319
--> 320 clocs = [index._get_level_number(i) for i in clocs]
321
322 rlocs = [i for i in range(index.nlevels) if i not in clocs]
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in <listcomp>(.0)
318 index = data.index
319
--> 320 clocs = [index._get_level_number(i) for i in clocs]
321
322 rlocs = [i for i in range(index.nlevels) if i not in clocs]
~\anaconda3\lib\site-packages\pandas\core\indexes\multi.py in _get_level_number(self, level)
1296 except ValueError:
1297 if not is_integer(level):
-> 1298 raise KeyError(f"Level {level} not found")
1299 elif level < 0:
1300 level += self.nlevels
KeyError: 'Level Date not found'
謝謝!
您正在尋找df.pivot
df = df.pivot(index='Name', columns=['Date', 'Label'], values='Price')
警告:如果任何名稱-日期-標簽組合重復(即出現在多行中),則會引發錯誤。 使用pivot_table
或更好的groupby
+ unstack
如果Name
、 Date
和Label
在索引中,則使用unstack
而不是pivot
使用樣本數據更新
df = pd.DataFrame({
# 'A': [160, 457, 457, 482, 482, 482, 482, 423, 223, 506],
# 'B': ['8/27/2015 0:00','10/15/2015 0:00','10/15/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','9/29/2015 0:00','9/9/2015 0:00','11/9/2015 0:00'],
'Date': ['8/28/2015 0:00','10/16/2015 0:00','10/16/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','9/30/2015 0:00','9/10/2015 0:00','11/10/2015 0:00'],
# 'C': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
# 'D': [1271, 1825, 1825, 1455, 1455, 1455, 1455, 2522, 1385, 1765],
'Price': [1058, 1685, 1615, 1195, 1255, 1279, 1295, 2285, 1285, 1665],
'Label': [3, 3, 2, 1, 3, 4, 2, 2, 1, 4],
# 'E': [13, 127, 127, -1, -1, -1, -1, -1, -1, -1],
'Name': ['foo1','foo2','foo2','foo3','foo3','foo3','foo3','foo4','foo4','foo3'],
# 'F': [4, 4, 4, 3, 3, 3, 3, 3, 3, 3],
# 'G': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# 'H': ['Friday','Friday','Friday','Thursday','Thursday','Thursday','Thursday','Wednesday','Thursday','Tuesday'],
# 'I': [213, 140, 210, 260, 200, 176, 160, 237, 100, 100],
# 'J': [16.758457907159716,7.671232876712329,11.506849315068493,17.869415807560138,13.745704467353955,12.096219931271474,10.996563573883162,9.397303727200637,7.220216606498194,5.6657223796034]
})
df.Date = pd.to_datetime(df.Date)
df = df.pivot(index='Name', columns=['Date', 'Label'], values='Price')
df = df.fillna(-1)
print(df)
Output
Date 2015-08-28 2015-10-16 2015-10-29 ... 2015-09-30 2015-09-10 2015-11-10
Label 3 3 2 1 ... 2 2 1 4
Name ...
foo1 1058.0 NaN NaN NaN ... NaN NaN NaN NaN
foo2 NaN 1685.0 1615.0 NaN ... NaN NaN NaN NaN
foo3 NaN NaN NaN 1195.0 ... 1295.0 NaN NaN 1665.0
foo4 NaN NaN NaN NaN ... NaN 2285.0 1285.0 NaN
[4 rows x 10 columns]
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.