[英]Walk Forward with validation window for time series data cross validation
I'm looking to perform walk-forward validation on my time-series data.我希望对我的时间序列数据执行前向验证。 Extensive documentation exists on how to perform rolling-window validation:
已有大量文档介绍如何执行滚动窗口验证:
or expanding-window validation:或扩展窗口验证:
But neither scheme corresponds to what will run in my production system: I want to retrain a model every day, and each model will make predictions 14 days into the future.但是这些验证方式都不对应于我的生产系统中的做法:我想每天重新训练一个模型,并用它对未来 14 天进行预测。 So each retraining only adds one day of data to my previous training period, whereas the other methods extend each successive training fold by an entire block of data of length
test_size
(14 days in my case).所以我每次只会在之前的训练期基础上添加一天的数据,而其他方法会让每个后续训练折增加一整块长度为
test_size
的数据(在我的情况下为 14 天)。 Therefore, I would like to validate my model with a sliding window:因此,我想用滑动窗口来验证我的模型:
My question is that I can't come across a Python library that would do the work.我的问题是我找不到可以完成这项工作的Python 库。 TimeSeriesSplit from sklearn has no option of that kind.
sklearn 的TimeSeriesSplit没有这种选择。 Basically I want to provide:
基本上我想提供:
test_size
, n_fold
and min_train_size
(
test_size
、n_fold
和 min_train_size
), and:
if n_fold > (n_samples - min_train_size) / test_size
then the next training_set
draws data from the previous fold's test_set
如果
n_fold > (n_samples - min_train_size) / test_size
那么下一个 training_set
会从前一个折的 test_set
中提取数据
Looks like your requirement is to make the test window span more than one fold.看起来您的要求是让测试窗口跨越多个折。 To make that change you need to tweak these lines.
要进行更改,您需要调整这些行。
I have made those changes and added a new param called n_test_folds
, so that it can be customizable.我进行了这些更改并添加了一个名为
n_test_folds
的新参数,以便可以对其进行自定义。
from sklearn.model_selection._split import TimeSeriesSplit
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.utils import indexable
from sklearn.utils.validation import _num_samples
class WindowedTestTimeSeriesSplit(TimeSeriesSplit):
    """Time-series cross-validator whose test window spans several folds.

    The samples are cut into ``n_splits + n_test_folds`` folds. Each split
    trains on the samples that precede the test window and tests on the next
    ``n_test_folds`` folds. Consecutive test windows start one fold apart, so
    they overlap whenever ``n_test_folds`` is greater than 1.

    Parameters
    ----------
    n_splits : int, default=5
        Number of train/test splits to generate.
    max_train_size : int, default=None
        Upper bound on the number of samples in a single training set.
    n_test_folds : int, default=1
        Number of folds used as the test window at each iteration.
    """

    @_deprecate_positional_args
    def __init__(self, n_splits=5, *, max_train_size=None, n_test_folds=1):
        super().__init__(n_splits, max_train_size=max_train_size)
        self.n_test_folds = n_test_folds

    def split(self, X, y=None, groups=None):
        """Yield (train, test) index arrays for each split.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to split; only its number of samples is used.
        y : array-like of shape (n_samples,)
            Passed through ``indexable`` but otherwise unused.
        groups : array-like of shape (n_samples,)
            Passed through ``indexable`` but otherwise unused.

        Yields
        ------
        train : ndarray
            Indices of the training samples for the split.
        test : ndarray
            Indices of the test samples for the split.

        Raises
        ------
        ValueError
            If ``n_splits + n_test_folds`` exceeds the number of samples.
        """
        X, y, groups = indexable(X, y, groups)
        sample_count = _num_samples(X)
        fold_count = self.n_splits + self.n_test_folds
        if sample_count < fold_count:
            raise ValueError(
                ("Cannot have number of folds ={0} greater"
                 " than the number of samples: {1}.").format(fold_count,
                                                             sample_count))
        all_indices = np.arange(sample_count)
        # Samples that do not fill a whole fold are absorbed by the first
        # training set.
        fold_len, leftover = divmod(sample_count, fold_count)
        window_len = fold_len * self.n_test_folds
        first_start = fold_len + leftover
        last_start = sample_count - window_len
        # Advance by one fold (not one test window) between splits.
        for start in range(first_start, last_start + 1, fold_len):
            cap = self.max_train_size
            # Trim the oldest samples when the training set would exceed cap.
            train_from = start - cap if cap and cap < start else 0
            yield (all_indices[train_from:start],
                   all_indices[start:start + window_len])
Example:例子:
import numpy as np
# Toy data: six samples with two features each, and one target per sample.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
# Four splits, each tested on a window of two folds.
tscv = WindowedTestTimeSeriesSplit(n_splits=4, n_test_folds=2)
print(tscv)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Slice features and targets with the index arrays of this split.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
# Expected output:
# WindowedTestTimeSeriesSplit(max_train_size=None, n_splits=4, n_test_folds=2)
# TRAIN: [0] TEST: [1 2]
# TRAIN: [0 1] TEST: [2 3]
# TRAIN: [0 1 2] TEST: [3 4]
# TRAIN: [0 1 2 3] TEST: [4 5]
Note: TRAIN: [0 1 2 3 4] TEST: [5] was not generated because its test set would span only one fold instead of the required two (n_test_folds=2).注意: TRAIN: [0 1 2 3 4] TEST: [5] 未生成,因为其测试集只包含一个折,而不是所要求的两个折(n_test_folds=2)。
Using this function, we can visualize the different splits of our CV.使用这个function,我们可以可视化 CV 的不同拆分。
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
# Fixed seed so the random feature matrix is reproducible.
np.random.seed(1338)
# Colormaps; not referenced again in this snippet -- presumably read as
# globals by plot_cv_indices (TODO confirm).
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
n_splits = 4
# Generate the class/group data; every size below derives from n_points.
n_points = 100
X = np.random.randn(n_points, 10)
percentiles_classes = [.1, .3, .6]
y = np.hstack([[ii] * int(n_points * perc)
               for ii, perc in enumerate(percentiles_classes)])
# Ten evenly sized, contiguous groups
groups = np.hstack([[ii] * (n_points // 10) for ii in range(10)])
fig, ax = plt.subplots()
cv = WindowedTestTimeSeriesSplit(n_splits=n_splits, n_test_folds=2)
# NOTE(review): plot_cv_indices is not defined in this snippet and must
# already be in scope -- presumably the helper from scikit-learn's
# "Visualizing cross-validation behavior" example; confirm.
plot_cv_indices(cv, X, y, groups, ax, n_splits)
plt.show()
Here is my solution that allows the user to specify the testing horizon and the minimum sample of data for training:这是我的解决方案,它允许用户指定测试范围和训练的最小数据样本:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils import indexable
from sklearn.utils.validation import _num_samples
class TimeSeriesSplitCustom(TimeSeriesSplit):
    """TimeSeriesSplit variant with a fixed test horizon and minimum train size.

    ``overlapping_split`` yields ``n_splits`` test windows of ``test_size``
    samples each. When the data are too short for that many non-overlapping
    windows after the first ``min_train_size`` samples, the windows are
    shifted by less than ``test_size`` and therefore overlap.

    Parameters
    ----------
    n_splits : int, default=5
        Number of train/test splits to generate.
    max_train_size : int, default=None
        Upper bound on the number of samples in a single training set.
    test_size : int, default=1
        Number of samples in each test window.
    min_train_size : int, default=1
        Minimum number of samples that precede the first test window.
    """

    def __init__(self, n_splits=5, max_train_size=None,
                 test_size=1,
                 min_train_size=1):
        super().__init__(n_splits=n_splits, max_train_size=max_train_size)
        self.test_size = test_size
        self.min_train_size = min_train_size

    def overlapping_split(self, X, y=None, groups=None):
        """Yield (train, test) index arrays, overlapping test windows if needed.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to split; only its number of samples is used.
        y : array-like of shape (n_samples,)
            Unused; accepted for signature compatibility.
        groups : array-like of shape (n_samples,)
            Unused; accepted for signature compatibility.

        Yields
        ------
        train : ndarray
            Indices of the training samples for the split.
        test : ndarray
            Indices of the test samples for the split.

        Raises
        ------
        ValueError
            If the samples cannot accommodate ``n_splits`` distinct test
            windows of ``test_size`` after ``min_train_size`` samples.
        """
        min_train_size = self.min_train_size
        test_size = self.test_size
        n_splits = self.n_splits
        n_samples = _num_samples(X)

        # Enough data for non-overlapping windows: defer to the parent split.
        if (n_samples - min_train_size) / test_size >= n_splits:
            print('(n_samples - min_train_size) / test_size >= n_splits')
            print('default TimeSeriesSplit.split() used')
            yield from super().split(X)
            return

        # Distance between consecutive test starts: spread the samples left
        # after the minimum training set and one test window evenly over the
        # remaining n_splits - 1 windows.
        shift = (n_samples - test_size - min_train_size) // (n_splits - 1)
        if shift < 1:
            # A zero step would crash range(); a negative one would produce
            # meaningless windows.
            raise ValueError(
                ("Cannot build {0} overlapping splits from {1} samples with"
                 " test_size={2} and min_train_size={3}.").format(
                     n_splits, n_samples, test_size, min_train_size))

        # Anchor the last window at the end of the data and step backwards.
        start_test = n_samples - test_size - (n_splits - 1) * shift
        if start_test < min_train_size:
            raise ValueError(
                ("The start of the testing : {0} is smaller"
                 " than the minimum training samples: {1}.").format(start_test,
                                                                    min_train_size))

        indices = np.arange(n_samples)
        for test_start in range(start_test, n_samples - test_size + 1, shift):
            if self.max_train_size and self.max_train_size < test_start:
                # Keep only the most recent max_train_size training samples.
                yield (indices[test_start - self.max_train_size:test_start],
                       indices[test_start:test_start + test_size])
            else:
                yield (indices[:test_start],
                       indices[test_start:test_start + test_size])
And with the visualisation:并通过可视化:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from ModelEvaluation import TimeSeriesSplitCustom
# Fixed seed so the random feature matrix is reproducible.
np.random.seed(1338)
# Colormaps; not referenced again in this snippet -- presumably read as
# globals by plot_cv_indices (TODO confirm).
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
n_splits = 13
# Generate the class/group data; every size below derives from n_points.
n_points = 100
X = np.random.randn(n_points, 10)
percentiles_classes = [.1, .3, .6]
y = np.hstack([[ii] * int(n_points * perc)
               for ii, perc in enumerate(percentiles_classes)])
# Ten evenly sized, contiguous groups
groups = np.hstack([[ii] * (n_points // 10) for ii in range(10)])
fig, ax = plt.subplots()
cv = TimeSeriesSplitCustom(n_splits=n_splits, test_size=20, min_train_size=12)
# NOTE(review): plot_cv_indices is not defined in this snippet and must
# already be in scope -- presumably the helper from scikit-learn's
# "Visualizing cross-validation behavior" example; confirm.
plot_cv_indices(cv, X, y, groups, ax, n_splits)
plt.show()
(To get the same result, make sure to change the loop header in the plot_cv_indices function to the line below.)(要获得相同的结果,请确保将 plot_cv_indices 函数中的循环语句改为下面这一行。)
for ii, (tr, tt) in enumerate(cv.overlapping_split(X=X, y=y, groups=group)):
Cheers!干杯!
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.