简体   繁体   中英

Combining Pandas GroupBy and transform with multiindex dataframes

I am fairly new to Python Pandas and I have problems getting the Pandas GroupBy combined with transform to act the way I want to. I have been unable to find an answer posted already, but I may have missed something.

I have a DataFrame with a large number of entries, structured as follows:

GLT_City = pd.read_csv('GlobalLandTemperaturesByCity.csv', sep=',')
GLT_City.head()

   AvgTemp  AvgTempUncert   City    Country Lat     Long    year    month   day
0   6.068   1.737          Århus    Denmark 57.05N  10.33E  1743    11  01
5   5.788   3.624          Århus    Denmark 57.05N  10.33E  1744    04  01
6   10.644  1.283          Århus    Denmark 57.05N  10.33E  1744    05  01
7   14.051  1.347          Århus    Denmark 57.05N  10.33E  1744    06  01
8   16.082  1.396          Århus    Denmark 57.05N  10.33E  1744    07  01
10  12.781  1.454          Århus    Denmark 57.05N  10.33E  1744    09  01
11  7.950   1.630          Århus    Denmark 57.05N  10.33E  1744    10  01
12  4.639   1.302          Århus    Denmark 57.05N  10.33E  1744    11  01

I want to compute the weighted average temperature for each city, for each month, and add this as a new column to my original data frame in the smoothest possible way, using transform(), for reasons that matter further down the line.

First, I define a function to compute a weighted average:

def wavg(group, data_name, weight_name, sigma=None):
    """Return the weighted average of one column of ``group``.

    Parameters
    ----------
    group
        Object indexed by column name (typically the per-group DataFrame
        handed over by ``groupby(...).apply``).
    data_name
        Key of the column holding the values to average.
    weight_name
        Key of the column holding the weights (or uncertainties).
    sigma
        Pass the string ``'sigma'`` to indicate that ``weight_name`` holds
        measurement uncertainties; the weights then become ``1 / value``.

    Returns
    -------
    ``sum(data * weight) / sum(weight)``, or the plain mean of the data
    when the weights sum to zero.
    """
    data = group[data_name]
    weight = group[weight_name]
    # Check whether we have actual weights or measurement uncertainties
    if sigma == 'sigma':
        weight = 1. / weight

    total_weight = weight.sum()
    # NumPy/pandas float division by zero yields nan/inf instead of raising
    # ZeroDivisionError, so the denominator has to be tested explicitly for
    # the unweighted-mean fallback to ever be used.
    if total_weight == 0:
        return data.mean()
    return (data * weight).sum() / total_weight

I then want to combine GroupBy and transform() to apply this function to my data frame and add the result as a new column, like:

GLT_City['WeightedMonthlyMean'] = GLT_City.groupby(['City','month']).transform(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma')

Now this results in a very lengthy error message, copy-pasted below:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()

TypeError: an integer is required

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-61-cef679f52b5f> in <module>()
----> 1 GLT_City['WeightedMonthlyMean'] = GLT_City.groupby(['City','month']).transform(wavg, 
'AvgTemp','AvgTemp', sigma='sigma')

~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in transform(self, func, *args, **kwargs)
   3814                 result = getattr(self, func)(*args, **kwargs)
   3815         else:
-> 3816             return self._transform_general(func, *args, **kwargs)
   3817 
   3818         # a reduction transform

~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in _transform_general(self, func, *args, **kwargs)
   3765                 # Try slow path and fast path.
   3766                 try:
-> 3767                     path, res = self._choose_path(fast_path, slow_path, group)
   3768                 except TypeError:
   3769                     return self._transform_item_by_item(obj, fast_path)

~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in _choose_path(self, fast_path, slow_path, group)
   3861     def _choose_path(self, fast_path, slow_path, group):
   3862         path = slow_path
-> 3863         res = slow_path(group)
   3864 
   3865         # if we make it here, test if we can use the fast path

~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(group)
   3856             fast_path = lambda group: func(group, *args, **kwargs)
   3857             slow_path = lambda group: group.apply(
-> 3858                 lambda x: func(x, *args, **kwargs), axis=self.axis)
   3859         return fast_path, slow_path
   3860 

~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   4260                         f, axis,
   4261                         reduce=reduce,
-> 4262                         ignore_failures=ignore_failures)
   4263             else:
   4264                 return self._apply_broadcast(f, axis)

~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
   4356             try:
   4357                 for i, v in enumerate(series_gen):
-> 4358                     results[i] = func(v)
   4359                     keys.append(v.name)
   4360             except Exception as e:

~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
   3856             fast_path = lambda group: func(group, *args, **kwargs)
   3857             slow_path = lambda group: group.apply(
-> 3858                 lambda x: func(x, *args, **kwargs), axis=self.axis)
   3859         return fast_path, slow_path
   3860 

<ipython-input-58-181ef4bb1f30> in wavg(group, data_name, weight_name, sigma)
     10 
     11     #Extracting data and weights.
---> 12     data = group[data_name]
     13     weight = group[weight_name]
     14     #Check whether we have actual weights, or measurement uncertainties

~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
    599         key = com._apply_if_callable(key, self)
    600         try:
--> 601             result = self.index.get_value(self, key)
    602 
    603             if not is_scalar(result):

~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
   2475         try:
   2476             return self._engine.get_value(s, k,
-> 2477                                           tz=getattr(series.dtype, 'tz', None))
   2478         except KeyError as e1:
   2479             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

 pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()

KeyError: ('AvgTemp', 'occurred at index AvgTemp')

So this is obviously not working, but it's not clear to me why. Any pointers/solutions would be most welcome.

I can use the apply() method, to get the desired output, but since I am averaging over groups, I can't really merge this with the original dataframe, since the series produced by apply() will be of a different size.

transform applies the function to each column of each group individually, passing it a Series rather than the group's DataFrame. Putting a print statement inside wavg will help you see the problem:

def wavg(group,data_name,weight_name, sigma=None):
    # Debugging aid: print the object that is actually passed in as `group`
    # so its type and contents can be inspected before the lookup fails.
    print(group)
    ...
df['WeightedMonthlyMean'] = df.groupby(['City','month']).transform(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma')

prints

1    5.788
Name: AvgTemp, dtype: object

before raising the KeyError. This shows that group is merely a Series, not the whole (group) DataFrame.

So instead, use apply, then merge the result back into df:

result = df.groupby(['City','month']).apply(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma').reset_index(name='wavg')
result = pd.merge(df, result)

For example,

import pandas as pd

df = pd.DataFrame({'AvgTemp': [6.068, 5.787999999999999, 10.644, 14.050999999999998, 16.082, 12.780999999999999, 7.95, 4.638999999999999], 'AvgTempUncert': [1.7369999999999999, 3.6239999999999997, 1.2830000000000001, 1.347, 1.396, 1.454, 1.63, 1.3019999999999998], 'City': ['Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus'], 'Country': ['Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark'], 'Lat': ['57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N'], 'Long': ['10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E'], 'day': [1, 1, 1, 1, 1, 1, 1, 1], 'month': [11, 4, 5, 6, 7, 9, 10, 11], 'year': [1743, 1744, 1744, 1744, 1744, 1744, 1744, 1744]}) 

def wavg(group, data_name, weight_name, sigma=None):
    """Return the weighted average of one column of ``group``.

    Parameters
    ----------
    group
        Object indexed by column name (typically the per-group DataFrame
        handed over by ``groupby(...).apply``).
    data_name
        Key of the column holding the values to average.
    weight_name
        Key of the column holding the weights (or uncertainties).
    sigma
        Pass the string ``'sigma'`` to indicate that ``weight_name`` holds
        measurement uncertainties; the weights then become ``1 / value``.

    Returns
    -------
    ``sum(data * weight) / sum(weight)``, or the plain mean of the data
    when the weights sum to zero.
    """
    data = group[data_name]
    weight = group[weight_name]
    # Check whether we have actual weights or measurement uncertainties
    if sigma == 'sigma':
        weight = 1. / weight

    total_weight = weight.sum()
    # NumPy/pandas float division by zero yields nan/inf instead of raising
    # ZeroDivisionError, so the denominator has to be tested explicitly for
    # the unweighted-mean fallback to ever be used.
    if total_weight == 0:
        return data.mean()
    return (data * weight).sum() / total_weight

# Aggregate once per (City, month) group, label the aggregated value 'wavg',
# then merge it back so every original row carries its group's average.
group_means = df.groupby(['City', 'month']).apply(
    wavg, 'AvgTemp', 'AvgTempUncert', sigma='sigma')
result = pd.merge(df, group_means.reset_index(name='wavg'))
print(result)

yields

   AvgTemp  AvgTempUncert   City  Country     Lat    Long  day  month  year       wavg  
0    6.068          1.737  Århus  Denmark  57.05N  10.33E    1     11  1743   5.251227   
1    4.639          1.302  Århus  Denmark  57.05N  10.33E    1     11  1744   5.251227   
2    5.788          3.624  Århus  Denmark  57.05N  10.33E    1      4  1744   5.788000   
3   10.644          1.283  Århus  Denmark  57.05N  10.33E    1      5  1744  10.644000   
4   14.051          1.347  Århus  Denmark  57.05N  10.33E    1      6  1744  14.051000   
5   16.082          1.396  Århus  Denmark  57.05N  10.33E    1      7  1744  16.082000   
6   12.781          1.454  Århus  Denmark  57.05N  10.33E    1      9  1744  12.781000   
7    7.950          1.630  Århus  Denmark  57.05N  10.33E    1     10  1744   7.950000   

How about using apply and then merge to get this into the same DataFrame? Example:

import numpy as np
import pandas as pd
# Synthetic demo data: 1000 rows with a random city id, month and value.
# np.random.randint excludes its upper bound, so 13 is required for the
# months to cover 1 through 12.
data = pd.DataFrame({
    'City': np.random.randint(0, 4, 1000),
    'Month': np.random.randint(1, 13, 1000),
    'T': np.random.randn(1000),
})
# Compute a per-group result with apply, then merge it back onto the
# original frame using the grouping keys.
pd.merge(
    data,
    data.groupby(['City', 'Month'])
        .apply(lambda x: x['T']*2)
        .reset_index()[['City', 'Month', 'T']]
        .rename(columns={'T': 'WeightedT'}),
    left_on=['City', 'Month'],
    right_on=['City', 'Month'],
)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM