如何加快 pandas 中涉及前一行和多行的計算？

Question

我正在嘗試對 dataframe 中的每一行做計算，而每一行的結果都依賴多個欄位以及前一行的計算結果。

當我處理 2,971,000 行時，我目前的解決方案耗時很長，需要 2 個多小時。

所以，我想知道有沒有其他能加快這個 function 的解決方案。

例如，我的數據看起來像這樣。

                        sig1    sig2   sig3   sig4  sig_p   sig_t
20210114 05:52:02.00     0.0    0.0    0.0    0.0   11.5    -3.5
20210114 05:52:02.01     0.0    0.0    0.0    0.0   11.6    -3.5
20210114 05:52:02.02     0.0    0.0    0.0    0.0   11.5    -3.5
20210114 05:52:02.03     0.0    0.0    0.0    0.0   11.6    -3.5
20210114 05:52:02.04     0.0    0.0    0.0    0.0   11.7    -3.5
...                      ...    ...    ...    ...   ...     ...
20210114 22:38:59.85     0.0    0.0    0.0    0.0   0.0     -0.5
20210114 22:38:59.86     0.0    0.0    0.0    0.0   0.0     -0.5
20210114 22:38:59.87     0.0    0.0    0.0    0.0   0.0     -0.5
20210114 22:38:59.88     0.0    0.0    0.0    0.0   0.0     -0.5
20210114 22:38:59.89     0.0    0.0    0.0    0.0   0.0     -0.5

我有一個 function，它逐行循環，並根據 sig1、sig_p、sig_t 以及前一行的 newcol 計算 newcol 的值。這個 function 會對 sig1、sig2、sig3、sig4 各運行一次。

我將向您展示我目前擁有的代碼，但它太慢了。

parameter.py

from typing import NamedTuple
class Param(NamedTuple):
    """Immutable set of coefficients consumed by ``calculation`` in test.py.

    The per-field notes below only describe where each value is read in
    ``calculation``; they do not assert physical meaning or units.
    """

    RATIO: float    # scales ``calc1`` together with COEFFICIENT
    D: float        # only used to derive ``A``
    T: float        # divisor of the ``calc4`` term
    M: float        # multiplied by SPEC in the denominator of ``a1``
    S: float        # used in ``FACTOR`` and in the ``val2`` term
    W: float        # multiplied by SPECIFIC in the denominator of ``a2``
    DYNAMIC: float  # multiplies ``sig`` inside ``h_bar``

    T_CONST: float  # value of ``val2`` when ``sig`` is 0
    P_CONST: float  # leading coefficient of ``h_bar``

    L_COEF: float   # additive term of ``h`` when ``sig >= DYNAMIC_SPEED``
    O_COEF: float   # ``h`` when ``sig`` is 0; additive term when ``sig < DYNAMIC_SPEED``

    @property
    def A(self):
        """Return ``D**2 * pi``."""
        # The listing never imports ``math`` at module level, so accessing
        # this property raised NameError. A function-scope import keeps the
        # class self-contained.
        import math

        return (self.D**2)*math.pi

    @property
    def FACTOR(self):
        """Return ``S / A``."""
        return self.S / self.A

# Coefficient set that test.py passes to ``calculation`` for the sig1 and
# sig2 columns. Trailing comments keep the alternative values the author
# left next to the active ones.
Param1 = Param(
    RATIO=0.74,
    D=172e-3,
    T=23e-3,
    M=6,
    S=53.7e-4,    # alt: 4232.5e-6
    W=0.805,
    DYNAMIC=0.3150,
    T_CONST=2,    # alt: 4
    P_CONST=0.2,  # alt: 3
    L_COEF=0.8,   # alt: 4
    O_COEF=2.5,
)

# Second coefficient set; same fields as ``Param1`` with different values.
rear = Param(
    RATIO=0.26,
    D=204e-3,
    T=10e-3,
    M=4,
    S=26.8e-4,
    W=0.38,
    DYNAMIC=0.3150,
    T_CONST=1.8,
    P_CONST=0.2,
    L_COEF=0.2,
    O_COEF=1.8,
)

test.py

import pandas as pd
import numpy as np
from scipy.interpolate import interp1d


# Scalars read by ``calculation``.
TIME_STAMP = 0.1      # multiplies the per-row increments ``a1`` / ``a2``
SPEC = 449            # multiplied by Param.M in the denominator of ``a1``
SPECIFIC = 935        # multiplied by Param.W in the denominator of ``a2``

EMISSIVITY = 0.7      # factor of the fourth-power difference term ``calc2``
ABSORBTIVITY = 0.3    # factor applied to ``calc2`` inside ``val1``

DYNAMIC_SPEED = 12    # ``sig`` threshold that selects the formula for ``h``

COEFFICIENT = 0.9506173967164384

# Sample points (x) and values (y) for the quadratic interpolation in
# ``viscosity``; the two lists are paired index by index, 8 entries per row.
input_KV = [
    -75, -50, -25, -15, -10, -5, 0, 5,
    10, 15, 20, 25, 30, 40, 50, 60,
    80, 100, 125, 150, 175, 200, 225, 300,
    412, 500, 600, 700, 800, 900, 1000, 1100,
]

viscosity_value = [
    7.4, 9.22, 11.18, 12.01, 12.43, 12.85, 13.28, 13.72,
    14.16, 14.61, 15.06, 15.52, 15.98, 16.92, 17.88, 18.86,
    20.88, 22.97, 25.69, 28.51, 31.44, 34.47, 37.6, 47.54,
    63.82, 77.72, 94.62, 112.6, 131.7, 151.7, 172.7, 194.6,
]

# Sample points (x) and values (y) for the quadratic interpolation in
# ``conductivity``; same pairing, with three extra leading entries.
input_ka = [
    -190, -150, -100,
    -75, -50, -25, -15, -10, -5, 0, 5,
    10, 15, 20, 25, 30, 40, 50, 60,
    80, 100, 125, 150, 175, 200, 225, 300,
    412, 500, 600, 700, 800, 900, 1000, 1100,
]

conductivity_value = [
    7.82, 11.69, 16.2,
    18.34, 20.41, 22.41, 23.2, 23.59, 23.97, 24.36, 24.74,
    25.12, 25.5, 25.87, 26.24, 26.62, 27.35, 28.08, 28.8,
    30.23, 31.62, 33.33, 35, 36.64, 38.25, 39.83, 44.41,
    50.92, 55.79, 61.14, 66.32, 71.35, 76.26, 81.08, 85.83,
]

# Built on first use and then reused: the table never changes between calls,
# so rebuilding the spline on every call was pure overhead.
_viscosity_interpolator = None


def viscosity(input):
    """Interpolate ``viscosity_value`` over ``input_KV`` and scale by ``10e-6``.

    Args:
        input: Scalar or array of query points. Values outside the range of
            ``input_KV`` make ``interp1d`` raise ``ValueError`` (its default
            ``bounds_error`` behavior).

    Returns:
        Quadratically interpolated value(s) multiplied by ``10e-6``, as a
        NumPy array (0-d for scalar input).
    """
    global _viscosity_interpolator
    if _viscosity_interpolator is None:
        _viscosity_interpolator = interp1d(input_KV,
                                           viscosity_value, kind='quadratic')
    # NOTE(review): 10e-6 equals 1e-5. If the table is meant to be scaled by
    # 1e-6 this is ten times too large. Confirm before changing.
    return _viscosity_interpolator(input) * 10e-6

# Built on first use and then reused: the table never changes between calls,
# so rebuilding the spline on every call was pure overhead.
_conductivity_interpolator = None


def conductivity(input):
    """Interpolate ``conductivity_value`` over ``input_ka`` and scale by ``10e-3``.

    Args:
        input: Scalar or array of query points. Values outside the range of
            ``input_ka`` make ``interp1d`` raise ``ValueError`` (its default
            ``bounds_error`` behavior).

    Returns:
        Quadratically interpolated value(s) multiplied by ``10e-3``, as a
        NumPy array (0-d for scalar input).
    """
    global _conductivity_interpolator
    if _conductivity_interpolator is None:
        _conductivity_interpolator = interp1d(input_ka,
                                              conductivity_value, kind='quadratic')
    # NOTE(review): 10e-3 equals 1e-2. If the table is meant to be scaled by
    # 1e-3 this is ten times too large. Confirm before changing.
    return _conductivity_interpolator(input) * 10e-3
    
def calculation(Param, sig, sig_p, sig_t):
    """Integrate two row-by-row recurrences and return the second one.

    Row 0 of both series is seeded with ``sig_t[0]``. Every later row adds
    ``TIME_STAMP`` times a rate that depends on the previous row's value and
    on the current row of ``sig``, ``sig_p`` and ``sig_t``. The second series
    is never allowed to drop below ``sig_t[0]``.

    Everything that does not depend on the previous row's result (the spline
    lookups, ``h``, ``calc1`` and the constant factors) is computed once as
    whole arrays. Only the recurrence itself remains in the Python loop;
    the original evaluated the splines inside that loop for every row.

    Args:
        Param: Coefficient set (see ``Param``).
        sig: 1-D array, one of the ``sig1``..``sig4`` columns.
        sig_p: 1-D array, the ``sig_p`` column.
        sig_t: 1-D array, the ``sig_t`` column.

    Returns:
        1-D float array with the same length as ``sig_p``. Empty input gives
        an empty array instead of raising ``IndexError``.

    Raises:
        ValueError: From ``interp1d`` when a looked-up ``sig_t`` value lies
            outside the interpolation tables.
    """
    sig = np.asarray(sig, dtype=float)
    sig_p = np.asarray(sig_p, dtype=float)
    sig_t = np.asarray(sig_t, dtype=float)

    n_rows = len(sig_p)
    if n_rows == 0:
        return np.empty(0)

    area = Param.A
    floor = sig_t[0]

    calc1 = COEFFICIENT * Param.RATIO * sig_p * sig / 2
    quarter_rest = (1 - COEFFICIENT) * calc1 / 4

    # Row 0 is only a seed, so the lookups start at row 1, exactly like the
    # original loop did.
    ka = np.zeros(n_rows)
    if n_rows > 1:
        ka[1:] = conductivity(sig_t[1:])

    # ``h`` defaults to O_COEF (the sig == 0 case). The viscosity lookup is
    # restricted to rows with sig != 0 so that out-of-table sig_t values on
    # the remaining rows still do not raise.
    h = np.full(n_rows, float(Param.O_COEF))
    nonzero = sig != 0
    nonzero[0] = False
    if nonzero.any():
        s = sig[nonzero]
        kv = viscosity(sig_t[nonzero])
        ka_nz = ka[nonzero]
        below = (0.7 * (s / kv) ** 0.4) * ka_nz + Param.O_COEF
        # NOTE(review): this evaluates s / (kv ** 0.8), whereas the branch
        # above uses (s / kv) ** 0.4. Possibly a misplaced parenthesis; the
        # original behavior is kept. Confirm the intended formula.
        at_or_above = (0.04 * s / kv ** 0.8) * ka_nz + Param.L_COEF
        h[nonzero] = np.where(s < DYNAMIC_SPEED, below, at_or_above)

    h_area = h * area
    ka_area = ka * area
    emit_area = EMISSIVITY * area
    absorb_factor = ABSORBTIVITY * Param.FACTOR
    hbar_s = Param.P_CONST * (sig * Param.DYNAMIC) ** 0.8 * Param.S
    denom1 = SPEC * Param.M
    denom2 = SPECIFIC * Param.W
    thickness = Param.T
    t_const = Param.T_CONST

    new_col1 = np.empty(n_rows)
    my_goal = np.empty(n_rows)
    new_col1[0] = floor
    my_goal[0] = floor

    for n in range(1, n_rows):
        prev1 = new_col1[n - 1]
        t_n = sig_t[n]
        diff1 = prev1 - t_n

        calc2 = emit_area * (prev1 ** 4 - t_n ** 4)
        calc3 = h_area[n] * diff1
        calc4 = ka_area[n] * diff1 / thickness

        a1 = (calc1[n] - (calc2 + calc3 + calc4)) / denom1
        new_col1[n] = prev1 + a1 * TIME_STAMP

        p_n = sig_p[n]
        if p_n == 0:
            val1 = absorb_factor * calc2
        elif 0 < p_n <= 20:
            # Linear blend between the two outer branches over (0, 20].
            val1 = absorb_factor * calc2 * (20 - p_n) / 20 + quarter_rest[n] * p_n / 20
        else:
            val1 = quarter_rest[n]

        prev2 = my_goal[n - 1]
        if sig[n] == 0:
            val2 = t_const
        else:
            val2 = hbar_s[n] * (prev2 - t_n)

        a2 = (val1 - val2) / denom2
        updated = prev2 + a2 * TIME_STAMP
        # Clamp: the result never goes below the initial sig_t value.
        if updated < floor:
            updated = floor
        my_goal[n] = updated

    return my_goal

# ``Param1`` and ``rear`` are the two coefficient sets defined in
# parameter.py; they must be in scope here (e.g. imported from that module).
df = pd.read_csv('data.csv', index_col=0)

# Fixes relative to the original listing:
#   * ``df[sig_p]`` referenced an undefined name; the column key is 'sig_p'.
#   * ``Param2`` is not defined anywhere; the second coefficient set in
#     parameter.py is called ``rear``.
df['newcol1'] = calculation(Param1, df['sig1'].values, df['sig_p'].values, df['sig_t'].values)
df['newcol2'] = calculation(Param1, df['sig2'].values, df['sig_p'].values, df['sig_t'].values)
df['newcol3'] = calculation(rear, df['sig3'].values, df['sig_p'].values, df['sig_t'].values)
df['newcol4'] = calculation(rear, df['sig4'].values, df['sig_p'].values, df['sig_t'].values)

我現在需要將此 function 應用到幾百萬行，而且速度非常慢，所以我試圖找出加速它的最佳方法。 我聽說 Cython 可以提高函數的速度，但我沒有這方面的經驗（而且我對 pandas 和 python 都是新手）。

我的問題是是否有任何方法可以增強或加速這種計算方法。

我在 AWS（SageMaker > notebook 實例，Jupyter）上運行這段 Python 代碼，我的電腦操作系統是 Windows。

Answer 1

逐行迭代很容易編寫，但對 dataframe 來說很慢。以下是解決方案的提示：您需要把 while 循環（while n < len(sig_p):）內的代碼矢量化。例如，您原本的代碼是：

def fun(Param, sig_p, sig, sig_t):
    """Abbreviated sketch of the row-by-row (loop) version.

    This is an illustrative excerpt, not runnable as shown: the remaining
    terms are elided ("......"), ``n`` is never incremented in the visible
    part, and nothing is returned.
    """
    # Both work arrays are seeded with the first sig_t value.
    tempvalue =   np.empty(sig_p.shape)
    tempvalue[0] = sig_t[0]
    newcol = np.empty(sig_p.shape)
    newcol[0] = sig_t[0]

    n = 1
    while n < len(sig_p):
        # Term 1 ("fun1"): depends only on the current row of sig_p.
        calc1 = Param.COEF * (sig_p[n]) * Param.NO * Param.EFF # fun1()

        # Term 2 ("fun2"): zero above the threshold; otherwise it uses the
        # previous row's computed value tempvalue[n-1].
        if sig[n] > Param.THRESHOLD:
            calc2 = 0
        else:
            calc2 = Param.EMISSIVITY * Param.CONSTANT * (tempvalue[n-1]**4 - sig_t[n]**4)

        # calc3
        # calc4
        # ...... (remaining terms and the loop increment are elided)

# Call site for the loop version; ``param1`` is not defined in this excerpt.
df['newcol1'] = fun(param1, df['sig_p'].values, df['sig1'].values, df['sig_t'].values)

為了消除 while 循環，fun1() 和 fun2() 可以這樣重寫：

def fun(Param, df, sigTag):
    """Abbreviated sketch of the column-wise (vectorized) version.

    Writes intermediate columns ``calc1`` and ``calc2`` into ``df`` in place.
    The remaining terms are elided, and the visible part has no ``return``.
    """
    # Vectorized term 1: whole-column arithmetic instead of a per-row loop.
    df['calc1'] = Param.COEF * df['sig_p'] * Param.NO * Param.EFF  

    # Vectorized term 2.
    # NOTE(review): the loop version uses the previously *computed* value
    # (tempvalue[n-1]); here it is replaced by the previous *input* row,
    # df['sig_t'].shift(1). These are not the same quantity, and row 0
    # becomes NaN after shift(1). Confirm this substitution is intended.
    df['calc2'] = Param.EMISSIVITY * Param.CONSTANT * (df['sig_t'].shift(1)**4 - df['sig_t']**4)
    # Rows above the threshold get 0, mirroring the if-branch of the loop version.
    df.loc[df[sigTag] > Param.THRESHOLD, 'calc2'] = 0

    # df['calc3'] = vectorized fun3()
    # df['calc4'] = vectorized fun4()
    # ...... (remaining terms are elided)
    
# NOTE(review): as shown, fun() returns None, so this assignment would store
# None unless the elided part returns the result column.
df['newcol1'] = fun(param1, df, 'sig1')

您可能還想將 dataframe 輸入到 fun() 而不是單獨的 ndarray(s) 中。

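這種方法會大大提高性能。您可能想對如何向量化計算進行一些研究。請注意：向量化只適用於不依賴前一行「計算結果」的項（例如只用到當前行輸入的 calc1）；像 tempvalue[n-1] 這種依賴上一步結果的遞推項，無法直接用 shift() 取代，仍然需要逐行計算。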

如何加快涉及前一行和多行 pandas 的計算？

問題描述

1 個解決方案

解決方案1
0 已采納 2021-04-02 13:08:23

如何加快涉及前一行和多行 pandas 的計算？

問題描述

1 個解決方案

解決方案1 0 已采納 2021-04-02 13:08:23

解決方案1
0 已采納 2021-04-02 13:08:23