EDIT:
There is a bug in pandas 1.0.5, after upgrading to 1.1.1, there is no error.
I have a pandas dataframe that looks like that:
Name Date Price Label Y Z
foo1 1/1/20 100 1 _ _
foo1 1/1/20 200 2 _ _
. . . . . .
. . . . . .
foo1 1/8/20 240 1 _ _
foo2 1/2/20 500 1 _ _
. . . . . .
. . . . . .
foo2 1/7/20 423 4 _ _
. . . . . .
. . . . . .
Name
column, ie foo1 - foo80 Date
valuesLabel
valuesI want to create a table st it will has 80 rows(each for each Name), and 20*4 + 1 columns (20x4 for each Date-Label combination, and 1 for the Name).
The final dataframe should look as follows:
**Name 1/1/20(Label1) 1/1/20(Label2) 1/1/20(Label3) 1/1/20(Label4) 1/2/20(Label1) ... 4/7/20(Label4)**
foo1 100 200 300 -1 -1 -1
foo2 -1 -1 -1 -1 500 -1
...............
...............
-1 indicates that there is not entry in the original entry for the specific Name-Date-Label combination.
I'm basically new to pandas, and I can certainly build the dataframe iteratively by hand(if..else solution) but I believe that there is a faster, readable, and easier solution.
> df.columns
Index(['A', 'B', 'Date', 'C',
'D', 'Price', 'Label', 'E',
'Name', 'F', 'G', 'H', 'I',
'J'],
dtype='object')
> df.head(10).to_dict('list')
{'A': [160, 457, 457, 482, 482, 482, 482, 423, 223, 506],
'B': ['8/27/2015 0:00',
'10/15/2015 0:00',
'10/15/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'9/29/2015 0:00',
'9/9/2015 0:00',
'11/9/2015 0:00'],
'Date': ['8/28/2015 0:00',
'10/16/2015 0:00',
'10/16/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'9/30/2015 0:00',
'9/10/2015 0:00',
'11/10/2015 0:00'],
'C': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
'D': [1271, 1825, 1825, 1455, 1455, 1455, 1455, 2522, 1385, 1765],
'Price': [1058, 1685, 1615, 1195, 1255, 1279, 1295, 2285, 1285, 1665],
'Label': [3, 3, 2, 1, 3, 4, 2, 2, 1, 4],
'E': [13, 127, 127, -1, -1, -1, -1, -1, -1, -1],
'Name': ['foo1',
'foo2',
'foo2',
'foo3',
'foo3',
'foo3',
'foo3',
'foo4',
'foo4',
'foo3'],
'F': [4, 4, 4, 3, 3, 3, 3, 3, 3, 3],
'G': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
'H': ['Friday',
'Friday',
'Friday',
'Thursday',
'Thursday',
'Thursday',
'Thursday',
'Wednesday',
'Thursday',
'Tuesday'],
'I': [213, 140, 210, 260, 200, 176, 160, 237, 100, 100],
'J': [16.758457907159716,
7.671232876712329,
11.506849315068493,
17.869415807560138,
13.745704467353955,
12.096219931271474,
10.996563573883162,
9.397303727200637,
7.220216606498194,
5.6657223796034]}
Using
import pandas_datareader.data as web
import pandas as pd
import numpy as np
from datetime import datetime
df = pd.DataFrame({
# 'A': [160, 457, 457, 482, 482, 482, 482, 423, 223, 506],
# 'B': ['8/27/2015 0:00','10/15/2015 0:00','10/15/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','9/29/2015 0:00','9/9/2015 0:00','11/9/2015 0:00'],
'Date': ['8/28/2015 0:00','10/16/2015 0:00','10/16/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','9/30/2015 0:00','9/10/2015 0:00','11/10/2015 0:00'],
# 'C': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
# 'D': [1271, 1825, 1825, 1455, 1455, 1455, 1455, 2522, 1385, 1765],
'Price': [1058, 1685, 1615, 1195, 1255, 1279, 1295, 2285, 1285, 1665],
'Label': [3, 3, 2, 1, 3, 4, 2, 2, 1, 4],
# 'E': [13, 127, 127, -1, -1, -1, -1, -1, -1, -1],
'Name': ['foo1','foo2','foo2','foo3','foo3','foo3','foo3','foo4','foo4','foo3'],
# 'F': [4, 4, 4, 3, 3, 3, 3, 3, 3, 3],
# 'G': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# 'H': ['Friday','Friday','Friday','Thursday','Thursday','Thursday','Thursday','Wednesday','Thursday','Tuesday'],
# 'I': [213, 140, 210, 260, 200, 176, 160, 237, 100, 100],
# 'J': [16.758457907159716,7.671232876712329,11.506849315068493,17.869415807560138,13.745704467353955,12.096219931271474,10.996563573883162,9.397303727200637,7.220216606498194,5.6657223796034]
})
df.pivot(index='Name', columns=['Date', 'Label'], values='Price')`
I get
ValueError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\indexes\multi.py in _get_level_number(self, level)
1294 try:
-> 1295 level = self.names.index(level)
1296 except ValueError:
ValueError: 'Date' is not in list
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-17-542e5c02777d> in <module>
----> 1 df.pivot(index='Name', columns=['Date', 'Label'], values='Price')
~\anaconda3\lib\site-packages\pandas\core\frame.py in pivot(self, index, columns, values)
5921 from pandas.core.reshape.pivot import pivot
5922
-> 5923 return pivot(self, index=index, columns=columns, values=values)
5924
5925 _shared_docs[
~\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in pivot(data, index, columns, values)
448 else:
449 indexed = data._constructor_sliced(data[values].values, index=index)
--> 450 return indexed.unstack(columns)
451
452
~\anaconda3\lib\site-packages\pandas\core\series.py in unstack(self, level, fill_value)
3548 from pandas.core.reshape.reshape import unstack
3549
-> 3550 return unstack(self, level, fill_value)
3551
3552 # ----------------------------------------------------------------------
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in unstack(obj, level, fill_value)
396 # _unstack_multiple only handles MultiIndexes,
397 # and isn't needed for a single level
--> 398 return _unstack_multiple(obj, level, fill_value=fill_value)
399 else:
400 level = level[0]
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in _unstack_multiple(data, clocs, fill_value)
318 index = data.index
319
--> 320 clocs = [index._get_level_number(i) for i in clocs]
321
322 rlocs = [i for i in range(index.nlevels) if i not in clocs]
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in <listcomp>(.0)
318 index = data.index
319
--> 320 clocs = [index._get_level_number(i) for i in clocs]
321
322 rlocs = [i for i in range(index.nlevels) if i not in clocs]
~\anaconda3\lib\site-packages\pandas\core\indexes\multi.py in _get_level_number(self, level)
1296 except ValueError:
1297 if not is_integer(level):
-> 1298 raise KeyError(f"Level {level} not found")
1299 elif level < 0:
1300 level += self.nlevels
KeyError: 'Level Date not found'
Thanks!
You are looking for df.pivot
df = df.pivot(index='Name', columns=['Date', 'Label'], values='Price')
Warning: if any name-date-label combination is duplicated (ie appears in more than one row) then an error will be raised. Use pivot_table
or better groupby
+ unstack
If Name
, Date
, and Label
are in the index then use unstack
instead of pivot
Update with sample data
df = pd.DataFrame({
# 'A': [160, 457, 457, 482, 482, 482, 482, 423, 223, 506],
# 'B': ['8/27/2015 0:00','10/15/2015 0:00','10/15/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','9/29/2015 0:00','9/9/2015 0:00','11/9/2015 0:00'],
'Date': ['8/28/2015 0:00','10/16/2015 0:00','10/16/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','9/30/2015 0:00','9/10/2015 0:00','11/10/2015 0:00'],
# 'C': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
# 'D': [1271, 1825, 1825, 1455, 1455, 1455, 1455, 2522, 1385, 1765],
'Price': [1058, 1685, 1615, 1195, 1255, 1279, 1295, 2285, 1285, 1665],
'Label': [3, 3, 2, 1, 3, 4, 2, 2, 1, 4],
# 'E': [13, 127, 127, -1, -1, -1, -1, -1, -1, -1],
'Name': ['foo1','foo2','foo2','foo3','foo3','foo3','foo3','foo4','foo4','foo3'],
# 'F': [4, 4, 4, 3, 3, 3, 3, 3, 3, 3],
# 'G': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# 'H': ['Friday','Friday','Friday','Thursday','Thursday','Thursday','Thursday','Wednesday','Thursday','Tuesday'],
# 'I': [213, 140, 210, 260, 200, 176, 160, 237, 100, 100],
# 'J': [16.758457907159716,7.671232876712329,11.506849315068493,17.869415807560138,13.745704467353955,12.096219931271474,10.996563573883162,9.397303727200637,7.220216606498194,5.6657223796034]
})
df.Date = pd.to_datetime(df.Date)
df = df.pivot(index='Name', columns=['Date', 'Label'], values='Price')
df = df.fillna(-1)
print(df)
Output
Date 2015-08-28 2015-10-16 2015-10-29 ... 2015-09-30 2015-09-10 2015-11-10
Label 3 3 2 1 ... 2 2 1 4
Name ...
foo1 1058.0 NaN NaN NaN ... NaN NaN NaN NaN
foo2 NaN 1685.0 1615.0 NaN ... NaN NaN NaN NaN
foo3 NaN NaN NaN 1195.0 ... 1295.0 NaN NaN 1665.0
foo4 NaN NaN NaN NaN ... NaN 2285.0 1285.0 NaN
[4 rows x 10 columns]
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.