I am fairly new to Python Pandas and I am having trouble getting Pandas GroupBy combined with transform to act the way I want it to. I have been unable to find an existing answer, but I may have missed something.
I have a DataFrame with a large number of entries, structured as follows:
GLT_City = pd.read_csv('GlobalLandTemperaturesByCity.csv', sep=',')
GLT_City.head()
AvgTemp AvgTempUncert City Country Lat Long year month day
0 6.068 1.737 Århus Denmark 57.05N 10.33E 1743 11 01
5 5.788 3.624 Århus Denmark 57.05N 10.33E 1744 04 01
6 10.644 1.283 Århus Denmark 57.05N 10.33E 1744 05 01
7 14.051 1.347 Århus Denmark 57.05N 10.33E 1744 06 01
8 16.082 1.396 Århus Denmark 57.05N 10.33E 1744 07 01
10 12.781 1.454 Århus Denmark 57.05N 10.33E 1744 09 01
11 7.950 1.630 Århus Denmark 57.05N 10.33E 1744 10 01
12 4.639 1.302 Århus Denmark 57.05N 10.33E 1744 11 01
I want to compute the weighted average temperature for each city and each month, and add it as a new column to my original DataFrame in the smoothest possible way using transform(), for reasons further down the line.
First, I define a function to compute a weighted average:
def wavg(group, data_name, weight_name, sigma=None):
    """Return the weighted average of one column of ``group``.

    Parameters
    ----------
    group : DataFrame-like
        Object indexable by column name that yields pandas Series.
    data_name : str
        Name of the column holding the values to average.
    weight_name : str
        Name of the column holding the weights (or uncertainties).
    sigma : str, optional
        Pass ``'sigma'`` when ``weight_name`` holds measurement
        uncertainties; each weight then becomes ``1 / uncertainty``.

    Returns
    -------
    The weighted mean of the data column, or its plain mean when the
    weights sum to zero.
    """
    data = group[data_name]
    weight = group[weight_name]
    # Check whether we have actual weights or measurement uncertainties
    if sigma == 'sigma':
        weight = 1. / weight
    total_weight = weight.sum()
    # pandas/NumPy float division by zero yields nan/inf instead of raising
    # ZeroDivisionError, so the zero-weight case has to be tested explicitly.
    if total_weight == 0:
        return data.mean()
    return (data * weight).sum() / total_weight
I then want to combine GroupBy and transform() to apply this function to my DataFrame and add the result as a new column, like this:
GLT_City['WeightedMonthlyMean'] = GLT_City.groupby(['City','month']).transform(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma')
This results in a very lengthy error message, copy-pasted below:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-61-cef679f52b5f> in <module>()
----> 1 GLT_City['WeightedMonthlyMean'] = GLT_City.groupby(['City','month']).transform(wavg,
'AvgTemp','AvgTemp', sigma='sigma')
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in transform(self, func, *args, **kwargs)
3814 result = getattr(self, func)(*args, **kwargs)
3815 else:
-> 3816 return self._transform_general(func, *args, **kwargs)
3817
3818 # a reduction transform
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in _transform_general(self, func, *args, **kwargs)
3765 # Try slow path and fast path.
3766 try:
-> 3767 path, res = self._choose_path(fast_path, slow_path, group)
3768 except TypeError:
3769 return self._transform_item_by_item(obj, fast_path)
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in _choose_path(self, fast_path, slow_path, group)
3861 def _choose_path(self, fast_path, slow_path, group):
3862 path = slow_path
-> 3863 res = slow_path(group)
3864
3865 # if we make it here, test if we can use the fast path
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(group)
3856 fast_path = lambda group: func(group, *args, **kwargs)
3857 slow_path = lambda group: group.apply(
-> 3858 lambda x: func(x, *args, **kwargs), axis=self.axis)
3859 return fast_path, slow_path
3860
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
4260 f, axis,
4261 reduce=reduce,
-> 4262 ignore_failures=ignore_failures)
4263 else:
4264 return self._apply_broadcast(f, axis)
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
4356 try:
4357 for i, v in enumerate(series_gen):
-> 4358 results[i] = func(v)
4359 keys.append(v.name)
4360 except Exception as e:
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
3856 fast_path = lambda group: func(group, *args, **kwargs)
3857 slow_path = lambda group: group.apply(
-> 3858 lambda x: func(x, *args, **kwargs), axis=self.axis)
3859 return fast_path, slow_path
3860
<ipython-input-58-181ef4bb1f30> in wavg(group, data_name, weight_name, sigma)
10
11 #Extracting data and weights.
---> 12 data = group[data_name]
13 weight = group[weight_name]
14 #Check whether we have actual weights, or measurement uncertainties
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
599 key = com._apply_if_callable(key, self)
600 try:
--> 601 result = self.index.get_value(self, key)
602
603 if not is_scalar(result):
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
2475 try:
2476 return self._engine.get_value(s, k,
-> 2477 tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
2479 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: ('AvgTemp', 'occurred at index AvgTemp')
So this is obviously not working, but it is not clear to me why. Any pointers/solutions would be most welcome.
I can use the apply() method to get the desired output, but since I am averaging over groups, I can't really merge this with the original DataFrame, since the Series produced by apply() will be of a different size (one value per group rather than one per row).
The transform function is applied to each column of each group individually. Putting a print statement inside wavg will help you see the problem:
def wavg(group,data_name,weight_name, sigma=None):
print(group)
...
df['WeightedMonthlyMean'] = df.groupby(['City','month']).transform(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma')
prints
1 5.788
Name: AvgTemp, dtype: object
before raising the KeyError. This shows that group is merely a Series (a single column of the group), not the whole group DataFrame, so group['AvgTemp'] is looked up in that Series' index and fails.
So instead, use apply, then merge the result back into df:
result = df.groupby(['City','month']).apply(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma').reset_index(name='wavg')
result = pd.merge(df, result)
For example,
import pandas as pd
df = pd.DataFrame({'AvgTemp': [6.068, 5.787999999999999, 10.644, 14.050999999999998, 16.082, 12.780999999999999, 7.95, 4.638999999999999], 'AvgTempUncert': [1.7369999999999999, 3.6239999999999997, 1.2830000000000001, 1.347, 1.396, 1.454, 1.63, 1.3019999999999998], 'City': ['Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus'], 'Country': ['Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark'], 'Lat': ['57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N'], 'Long': ['10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E'], 'day': [1, 1, 1, 1, 1, 1, 1, 1], 'month': [11, 4, 5, 6, 7, 9, 10, 11], 'year': [1743, 1744, 1744, 1744, 1744, 1744, 1744, 1744]})
def wavg(group, data_name, weight_name, sigma=None):
    """Return the weighted average of one column of ``group``.

    Parameters
    ----------
    group : DataFrame-like
        Object indexable by column name that yields pandas Series.
    data_name : str
        Name of the column holding the values to average.
    weight_name : str
        Name of the column holding the weights (or uncertainties).
    sigma : str, optional
        Pass ``'sigma'`` when ``weight_name`` holds measurement
        uncertainties; each weight then becomes ``1 / uncertainty``.

    Returns
    -------
    The weighted mean of the data column, or its plain mean when the
    weights sum to zero.
    """
    data = group[data_name]
    weight = group[weight_name]
    # Check whether we have actual weights or measurement uncertainties
    if sigma == 'sigma':
        weight = 1. / weight
    total_weight = weight.sum()
    # pandas/NumPy float division by zero yields nan/inf instead of raising
    # ZeroDivisionError, so the zero-weight case has to be tested explicitly.
    if total_weight == 0:
        return data.mean()
    return (data * weight).sum() / total_weight
result = df.groupby(['City','month']).apply(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma').reset_index(name='wavg')
result = pd.merge(df, result)
print(result)
yields
AvgTemp AvgTempUncert City Country Lat Long day month year wavg
0 6.068 1.737 Århus Denmark 57.05N 10.33E 1 11 1743 5.251227
1 4.639 1.302 Århus Denmark 57.05N 10.33E 1 11 1744 5.251227
2 5.788 3.624 Århus Denmark 57.05N 10.33E 1 4 1744 5.788000
3 10.644 1.283 Århus Denmark 57.05N 10.33E 1 5 1744 10.644000
4 14.051 1.347 Århus Denmark 57.05N 10.33E 1 6 1744 14.051000
5 16.082 1.396 Århus Denmark 57.05N 10.33E 1 7 1744 16.082000
6 12.781 1.454 Århus Denmark 57.05N 10.33E 1 9 1744 12.781000
7 7.950 1.630 Århus Denmark 57.05N 10.33E 1 10 1744 7.950000
How about using apply and then merge to get this into the same DataFrame? Example:
import numpy as np
import pandas as pd
# Toy data: 4 cities, months 1-12, random temperatures.
# np.random.randint excludes its upper bound, so 13 is needed to include month 12.
data = pd.DataFrame({
    'City': np.random.randint(0, 4, 1000),
    'Month': np.random.randint(1, 13, 1000),
    'T': np.random.randn(1000),
})

# Reduce every (City, Month) group to ONE value. The doubled mean is only a
# placeholder for the real weighted-average computation. If the function
# returned one value per row instead, the merge below would be many-to-many
# and would multiply the rows of ``data``.
group_values = (
    data.groupby(['City', 'Month'])['T']
    .apply(lambda x: (x * 2).mean())
    .reset_index(name='WeightedT')
)

# Attach the per-group value to every original row; the row count is unchanged.
merged = pd.merge(data, group_values, on=['City', 'Month'])
print(merged)
The technical posts on this site are licensed under CC BY-SA 4.0. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.