[英]Time Series prediction for python dataframe
我正在處理如下所示的代碼:
# Load the per-year tag counts and derive each tag's share of its year's total.
df = pd.read_csv("file.csv")
share = df['number'] / df['year_total']
# Store the share rounded to four decimal places.
df['fraction'] = share.round(4)
df  # notebook-style display of the resulting frame
給出輸出為
# Tags of interest; rows are kept only when their 'tag' value appears in this list.
programming_lang = ["r", "python", "c#", "java", "JavaScript", "php", "c++", "ruby", "Selenium"]
# Boolean mask first, then row selection, so each step can be inspected on its own.
is_tracked = df['tag'].isin(programming_lang)
yearly_top = df[is_tracked]
yearly_top  # notebook-style display of the filtered frame
給出如下輸出:
year, tag, number, year_total, fraction
2008, java, 7473, 58390,0.1280
2008, php, 3111, 58390, 0.0533
2008, python, 2080, 58390, 0.0356
......
2019, java, 83841, 1085170, 0.0773
2019, php, 61257, 1085170, 0.0564
2019, python, 107348, 1085170, 0.0989
它包含了 2008 年到 2019 年的頂級編程語言數據。我想使用時間序列模型來預測這些編程語言在 2020 年、2021 年和 2022 年的 fraction 值。
我對這個領域很陌生,任何線索都會有所幫助。
您可以使用 RNN 解決它。 首先,讓我們創建一個示例數據框來使用
import pandas as pd
import numpy as np
# Toy data: one row per year from 2008 through 2019, one random column per language.
years = range(2008, 2020)
test_df = pd.DataFrame({'year': years})
# column index -> language: 0-java, 1-php, 2-python
for lang_idx in range(3):
    test_df['frac_%i' % lang_idx] = np.random.rand(len(years))
# The year column is not a model feature, so it is removed before extracting values.
test_df = test_df.drop(columns='year')
# the array of fractions
data = test_df.values
在刪除 year 列之前,test_df 看起來像:
year frac_0 frac_1 frac_2
0 2008 0.457123 0.780754 0.978396
1 2009 0.578795 0.323664 0.909824
2 2010 0.707996 0.477242 0.948976
3 2011 0.455918 0.627572 0.137039
4 2012 0.272352 0.144968 0.831693
5 2013 0.064729 0.233168 0.554654
6 2014 0.754608 0.570530 0.968355
7 2015 0.435918 0.264335 0.727189
8 2016 0.699624 0.455323 0.237246
9 2017 0.824758 0.995260 0.333113
10 2018 0.597993 0.384319 0.750074
11 2019 0.598657 0.533934 0.072334
在使用 RNN 進行時間序列分析時,首先要將任務轉換為監督回歸任務,即我們需要創建一個數據框,其中每一行都是
observations of the past year | observation of a year
這是一個可以幫助您實現此目的的函數(我從這篇精彩的帖子中學到了這個函數):
def series_to_supervised(data, n_in, n_out):
    """Reframe a time series as a supervised-learning table.

    Each output row is the concatenation of the ``n_in`` observations that
    precede time ``t`` (oldest first) followed by the ``n_out`` observations
    starting at ``t``.

    Args:
        data: Anything ``pd.DataFrame`` accepts; one row per time step and
            one column per variable.
        n_in: Number of lagged time steps used as inputs.
        n_out: Number of time steps (starting at ``t``) used as targets.

    Returns:
        A 2-D NumPy array with ``(n_in + n_out) * n_variables`` columns.
        Rows containing any NaN (introduced by shifting at the edges of the
        series, or already present in ``data``) are dropped.
    """
    df = pd.DataFrame(data)
    # Lagged copies t-n_in ... t-1 form the input part of each row.
    lagged = [df.shift(i) for i in range(n_in, 0, -1)]
    # Copies t ... t+n_out-1 form the target part of each row.
    ahead = [df.shift(-i) for i in range(n_out)]
    agg = pd.concat(lagged + ahead, axis=1)
    # Shifting leaves NaNs at the edges; those incomplete rows are discarded.
    return agg.dropna().values
使用此函數,我們可以創建所需的數據幀
# Window configuration: two past rows as input, one row as the prediction target.
n_in = 2
n_out = 1
# Rebind `data` to the supervised (inputs | targets) framing of the series.
data = series_to_supervised(test_df, n_in, n_out)
n_in 是我們想要用來進行預測的過去年數,n_out 是我們想要預測的年數。在這種情況下,我們僅根據過去兩年的數據預測一年。
現在我們已經准備好了數據,我們可以訓練一個 RNN 模型
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Dense,LSTM,Dropout
# Derive the number of variables per time step from the supervised table
# instead of hard-coding 3, so the code works for any number of series.
n_features = data.shape[1] // (n_in + n_out)
n_inputs = n_in * n_features
# x gets a length-1 time axis: (samples, 1, n_in * n_features).
# y holds the remaining target columns: (samples, n_out * n_features).
x, y = data[:, None, :n_inputs], data[:, n_inputs:]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=49)
model = Sequential()
model.add(LSTM(4, name='lstm_0'))
model.add(Dropout(0.2, name='dropout_0'))
# One output unit per target column.
# NOTE(review): tanh can emit negative values although fractions are
# non-negative; a sigmoid output may fit better. Left unchanged here.
model.add(Dense(n_out * n_features, activation='tanh'))
model.compile(loss='mse', optimizer='adam', metrics=['mse'])
# fit
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=50, verbose=0)
使用此模型,您可以通過以下方式預測 2020、2021 和 2022 年的分數
# Recursive forecast: each prediction becomes the newest year of the next input window.
observed = test_df.values
# predict 2020 with 2018 and 2019
last_two_years = np.concatenate([observed[-2], observed[-1]]).reshape(1, 1, -1)
frac_2020 = model.predict(last_two_years)
# predict 2021 with 2019 and 2020
last_two_years = np.concatenate([observed[-1], frac_2020.ravel()]).reshape(1, 1, -1)
frac_2021 = model.predict(last_two_years)
# predict 2022 with 2020 and 2021
last_two_years = np.concatenate([frac_2020.ravel(), frac_2021.ravel()]).reshape(1, 1, -1)
frac_2022 = model.predict(last_two_years)
完整的腳本
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Dense,LSTM,Dropout
def series_to_supervised(data, n_in, n_out):
    """Reframe a time series as a supervised-learning table.

    Each output row is the concatenation of the ``n_in`` observations that
    precede time ``t`` (oldest first) followed by the ``n_out`` observations
    starting at ``t``.

    Args:
        data: Anything ``pd.DataFrame`` accepts; one row per time step and
            one column per variable.
        n_in: Number of lagged time steps used as inputs.
        n_out: Number of time steps (starting at ``t``) used as targets.

    Returns:
        A 2-D NumPy array with ``(n_in + n_out) * n_variables`` columns.
        Rows containing any NaN (introduced by shifting at the edges of the
        series, or already present in ``data``) are dropped.
    """
    df = pd.DataFrame(data)
    # Lagged copies t-n_in ... t-1 form the input part of each row.
    lagged = [df.shift(i) for i in range(n_in, 0, -1)]
    # Copies t ... t+n_out-1 form the target part of each row.
    ahead = [df.shift(-i) for i in range(n_out)]
    agg = pd.concat(lagged + ahead, axis=1)
    # Shifting leaves NaNs at the edges; those incomplete rows are discarded.
    return agg.dropna().values
# Toy data: one row per year from 2008 through 2019, one random column per language.
years = range(2008, 2020)
test_df = pd.DataFrame({'year': years})
# column index -> language: 0-java, 1-php, 2-python
for lang_idx in range(3):
    test_df['frac_%i' % lang_idx] = np.random.rand(len(years))
# The year column is not a model feature, so it is removed before extracting values.
test_df = test_df.drop(columns='year')
# the array of fractions
data = test_df.values
# cast the task as a supervised regression task
# Window configuration: two past rows as input, one row as the prediction target.
n_in = 2
n_out = 1
# Rebind `data` to the supervised (inputs | targets) framing of the series.
data = series_to_supervised(test_df, n_in, n_out)
# train test split
# Derive the number of variables per time step from the supervised table
# instead of hard-coding 3, so the code works for any number of series.
n_features = data.shape[1] // (n_in + n_out)
n_inputs = n_in * n_features
# x gets a length-1 time axis: (samples, 1, n_in * n_features).
# y holds the remaining target columns: (samples, n_out * n_features).
x, y = data[:, None, :n_inputs], data[:, n_inputs:]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=49)
model = Sequential()
model.add(LSTM(4, name='lstm_0'))
model.add(Dropout(0.2, name='dropout_0'))
# One output unit per target column.
# NOTE(review): tanh can emit negative values although fractions are
# non-negative; a sigmoid output may fit better. Left unchanged here.
model.add(Dense(n_out * n_features, activation='tanh'))
model.compile(loss='mse', optimizer='adam', metrics=['mse'])
# fit
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=50, verbose=0)
# Recursive forecast: each prediction becomes the newest year of the next input window.
observed = test_df.values
# predict 2020 with 2018 and 2019
last_two_years = np.concatenate([observed[-2], observed[-1]]).reshape(1, 1, -1)
frac_2020 = model.predict(last_two_years)
# predict 2021 with 2019 and 2020
last_two_years = np.concatenate([observed[-1], frac_2020.ravel()]).reshape(1, 1, -1)
frac_2021 = model.predict(last_two_years)
# predict 2022 with 2020 and 2021
last_two_years = np.concatenate([frac_2020.ravel(), frac_2021.ravel()]).reshape(1, 1, -1)
frac_2022 = model.predict(last_two_years)
# Show the three forecasts in chronological order.
print(frac_2020, frac_2021, frac_2022)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.