[英]cuFFT and streams
我正在嘗試使用流異步啟動多個CUDA FFT內核。 為此,我按如下方式創建流以及cuFFT的正向和反向計劃:
// Allocate host-side arrays holding streamNum stream handles and streamNum
// forward / inverse cuFFT plan handles.
streams = (cudaStream_t*) malloc(sizeof(cudaStream_t)*streamNum);
plansF = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
plansI = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
// For every index i: create a stream, build one R2C and one C2R plan, and
// bind both plans to that stream.
// NOTE(review): CHECK_ERROR is defined outside this snippet; the cuFFT calls
// return a cufftResult that is discarded here, so confirm the macro can
// actually observe cuFFT failures.
for(int i=0; i<streamNum; i++)
{
cudaStreamCreate(&streams[i]);
CHECK_ERROR(5)
// Forward plan: 1-D real-to-complex transform of length ticks, batch of 1.
cufftPlan1d(&plansF[i], ticks, CUFFT_R2C,1);
CHECK_ERROR(5)
// Inverse plan: 1-D complex-to-real transform of length ticks, batch of 1.
cufftPlan1d(&plansI[i], ticks, CUFFT_C2R,1);
CHECK_ERROR(5)
// Associate both plans with streams[i].
cufftSetStream(plansF[i],streams[i]);
CHECK_ERROR(5)
cufftSetStream(plansI[i],streams[i]);
CHECK_ERROR(5)
}
在main函數中,我正在啟動正向FFT,如下所示:
// Issue one forward (R2C) transform per iteration using the plan at index w.
// NOTE(review): the loop starts at w=1, so plansF[0] is not used here; q is
// defined outside this snippet - verify that q does not exceed streamNum.
for(w=1;w<q;w++)
{
// Input is read at offset k of gpuMem1, output is written at offset j of gpuMem2.
cufftExecR2C(plansF[w], gpuMem1+k,gpuMem2+j);
CHECK_ERROR(8)
// Advance both offsets for the next transform.
k += rect_small_real;
j += rect_small_complex;
}
我還有其他內核,我使用相同的流異步啟動。
當我使用Visual Profiler 5.0分析我的應用程序時,我發現除了CUDA FFT(正向和反向)之外的所有內核並行運行並重疊。 FFT內核確實在不同的流中運行,但它們不重疊,因為它們實際上是順序運行的。 誰能告訴我我的問題是什么?
我的環境是VS 2008,64位,Windows 7。
謝謝。
這是在Kepler體系結構中使用CUDA中的流的cuFFT執行和memcopies的工作示例。
這是代碼:
#include <stdio.h>
#include <cufft.h>
#define NUM_STREAMS 3
/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports any failure together with the file
// and line of the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (normally __FILE__)
//   line  - source line of the call site (normally __LINE__)
//   abort - when true (the default) the process exits with `code` as status
// `file` is const-qualified because __FILE__ expands to a string literal, and
// binding a string literal to a non-const char* is ill-formed since C++11.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/*********************/
/* cuFFT ERROR CHECK */
/*********************/
// cuFFT reports failures through cufftResult rather than cudaError_t, so it
// needs its own checker: prints the numeric status plus the call site and
// exits when `code` is not CUFFT_SUCCESS.
#define cufftErrchk(ans) { cufftAssert((ans), __FILE__, __LINE__); }
static void cufftAssert(cufftResult code, const char *file, int line)
{
    if (code != CUFFT_SUCCESS)
    {
        fprintf(stderr, "CUFFTassert: error code %d %s %d\n", (int)code, file, line);
        exit((int)code);
    }
}
/********/
/* MAIN */
/********/
// Runs NUM_STREAMS independent complex-to-complex forward FFTs of length N,
// one per CUDA stream. Each stream performs: host-to-device copy, FFT,
// device-to-host copy. Work is issued breadth-first (all uploads, then all
// FFTs, then all downloads), matching the original issue order.
// Returns 0 on success; any CUDA or cuFFT failure terminates the process.
int main()
{
    const int N = 5000;
    const size_t bytes = N * sizeof(float2);

    // --- Host input/output buffers, one pair per stream.
    // Inputs are filled with (1, 0), outputs are zeroed.
    float2 *h_in[NUM_STREAMS];
    float2 *h_out[NUM_STREAMS];
    for (int s = 0; s < NUM_STREAMS; s++) {
        h_in[s]  = new float2[N];
        h_out[s] = new float2[N];
        for (int i = 0; i < N; i++) {
            h_in[s][i].x  = 1.f;
            h_in[s][i].y  = 0.f;
            h_out[s][i].x = 0.f;
            h_out[s][i].y = 0.f;
        }
    }

    // --- Registers host memory as page-locked (required for cudaMemcpyAsync
    //     to be truly asynchronous)
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaHostRegister(h_in[s],  bytes, cudaHostRegisterPortable));
        gpuErrchk(cudaHostRegister(h_out[s], bytes, cudaHostRegisterPortable));
    }

    // --- Device input/output data allocation
    float2 *d_in[NUM_STREAMS];
    float2 *d_out[NUM_STREAMS];
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaMalloc((void**)&d_in[s],  bytes));
        gpuErrchk(cudaMalloc((void**)&d_out[s], bytes));
    }

    // --- Creates CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for (int s = 0; s < NUM_STREAMS; s++) gpuErrchk(cudaStreamCreate(&streams[s]));

    // --- Creates cuFFT plans and binds each one to its stream.
    //     A fixed-size array is used so there is no heap block to leak.
    cufftHandle plans[NUM_STREAMS];
    for (int s = 0; s < NUM_STREAMS; s++) {
        cufftErrchk(cufftPlan1d(&plans[s], N, CUFFT_C2C, 1));
        cufftErrchk(cufftSetStream(plans[s], streams[s]));
    }

    // --- Async memcopies and computations, issued breadth-first
    for (int s = 0; s < NUM_STREAMS; s++)
        gpuErrchk(cudaMemcpyAsync(d_in[s], h_in[s], bytes, cudaMemcpyHostToDevice, streams[s]));
    for (int s = 0; s < NUM_STREAMS; s++)
        cufftErrchk(cufftExecC2C(plans[s], (cufftComplex*)d_in[s], (cufftComplex*)d_out[s], CUFFT_FORWARD));
    for (int s = 0; s < NUM_STREAMS; s++)
        gpuErrchk(cudaMemcpyAsync(h_out[s], d_out[s], bytes, cudaMemcpyDeviceToHost, streams[s]));

    // --- Waits until every stream has drained before touching the buffers
    for (int s = 0; s < NUM_STREAMS; s++)
        gpuErrchk(cudaStreamSynchronize(streams[s]));

    // --- Releases resources (plans first, since they reference the streams)
    for (int s = 0; s < NUM_STREAMS; s++) cufftErrchk(cufftDestroy(plans[s]));
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaHostUnregister(h_in[s]));
        gpuErrchk(cudaHostUnregister(h_out[s]));
        gpuErrchk(cudaFree(d_in[s]));
        gpuErrchk(cudaFree(d_out[s]));
        gpuErrchk(cudaStreamDestroy(streams[s]));
        delete[] h_in[s];
        delete[] h_out[s];
    }

    gpuErrchk(cudaDeviceReset());
    return 0;
}
請根據CUFFT錯誤處理添加cuFFT錯誤檢查。
下面提供了在Kepler K20c卡上測試上述算法時的一些分析信息。 正如您將看到的,只有當N足夠大時,才能實現計算和內存傳輸之間的真正重疊。
N = 5000
N = 50000
N = 500000
問題出在你使用的硬件上。
所有支持CUDA的GPU都能夠在執行內核的同時雙向復制數據。 但是,只有Compute Capability 3.5及以上的設備才具有名為Hyper-Q的功能。
簡而言之,在這些GPU中實現了幾個(我認為是16個)硬件內核隊列。 在之前的GPU中,只有一個硬件隊列可用。
這意味着cudaStreams只是虛擬的,只有在重疊計算和內存復制的情況下,它們對舊硬件的使用才有意義。 當然,這不僅適用於cuFFT,也適用於您自己的內核!
請深入了解visual profiler的輸出。 您可能會無意中將時間線可視化視為GPU執行的確切數據。 然而,事情並非那么簡單。 有幾行顯示的數據可能指的是執行內核啟動線的時間點(通常是橙色的)。 此行對應於GPU上的特定內核(藍色矩形)的執行。 內存傳輸也是如此(確切的時間顯示為淺棕色矩形)。
希望,我幫你解決了你的問題。
這是在@JackOLantern代碼基礎上改寫的一個版本,可以輕松改變FFT的數量、FFT長度和流的數量,以便在nvvp中試驗GPU利用率。
// Compile with:
// nvcc --std=c++11 stream_parallel.cu -o stream_parallel -lcufft
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
// Wrap a call so that a non-zero status aborts the program after reporting
// where it happened: file, line, numeric code, and the text of the call.
#define check_cuda_errors(val) __check_cuda_errors__ ( (val), #val, __FILE__, __LINE__ )

// Reports and terminates when `code` is non-zero.
//   code - status value returned by the wrapped call (zero means success)
//   func - stringified text of the wrapped call
//   file - source file of the call site
//   line - source line of the call site
// The "type" text comes from cudaGetLastError(), not from `code` itself.
// On failure the device is reset and the process exits with EXIT_FAILURE.
template <typename T>
inline void __check_cuda_errors__(T code, const char *func, const char *file, int line) {
    // Nothing to report on success.
    if (!code) {
        return;
    }

    const char *last_error_text = cudaGetErrorString(cudaGetLastError());
    const unsigned int numeric_code = (unsigned int) code;

    std::cout << "CUDA error at " << file << ":" << line << std::endl;
    std::cout << "error code: " << numeric_code
              << " type: \"" << last_error_text << "\"" << std::endl;
    std::cout << "func: \"" << func << "\"" << std::endl;

    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
// Distributes NUM_DATA complex-to-complex forward FFTs of length N round-robin
// over NUM_STREAMS CUDA streams. Each FFT is an upload / transform / download
// sequence queued on one stream, so sequences on different streams may
// overlap. Returns 0 on success; any CUDA or cuFFT failure terminates the
// process.
int main(int argc, char *argv[]) {
    // Number of FFTs to compute.
    const int NUM_DATA = 64;
    // Length of each FFT.
    const int N = 1048576;
    // Number of GPU streams across which to distribute the FFTs.
    const int NUM_STREAMS = 4;
    // Size in bytes of one FFT-length buffer.
    const size_t BYTES = N * sizeof(float2);

    // cuFFT reports failures through cufftResult, which the CUDA runtime
    // error-string lookup does not describe, so it gets its own checker.
    auto check_cufft = [](cufftResult status, const char *call, int line) {
        if (status != CUFFT_SUCCESS) {
            std::cout << "cuFFT error at "
                      << __FILE__ << ":" << line << std::endl
                      << "error code: " << (unsigned int) status << std::endl
                      << "func: \"" << call << "\""
                      << std::endl;
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }
    };

    // Allocate and initialize host input data.
    float2 **h_in = new float2 *[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        h_in[ii] = new float2[N];
        for (int jj = 0; jj < N; ++jj) {
            h_in[ii][jj].x = 1.f;
            h_in[ii][jj].y = 0.f;
        }
    }

    // Allocate and initialize host output data.
    float2 **h_out = new float2 *[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        h_out[ii] = new float2[N];
        for (int jj = 0; jj < N; ++jj) {
            h_out[ii][jj].x = 0.f;
            h_out[ii][jj].y = 0.f;
        }
    }

    // Pin host input and output memory for cudaMemcpyAsync.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaHostRegister(h_in[ii], BYTES, cudaHostRegisterPortable));
        check_cuda_errors(cudaHostRegister(h_out[ii], BYTES, cudaHostRegisterPortable));
    }

    // Allocate pointers to device input and output arrays.
    float2 **d_in = new float2 *[NUM_STREAMS];
    float2 **d_out = new float2 *[NUM_STREAMS];

    // Allocate input and output arrays on device.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaMalloc((void**)&d_in[ii], BYTES));
        check_cuda_errors(cudaMalloc((void**)&d_out[ii], BYTES));
    }

    // Create CUDA streams.
    cudaStream_t streams[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaStreamCreate(&streams[ii]));
    }

    // Create cuFFT plans and bind each one to its stream.
    // Allocated with new[] so that the matching delete[] below is well-defined.
    cufftHandle *plans = new cufftHandle[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cufft(cufftPlan1d(&plans[ii], N, CUFFT_C2C, 1), "cufftPlan1d", __LINE__);
        check_cufft(cufftSetStream(plans[ii], streams[ii]), "cufftSetStream", __LINE__);
    }

    // Fill streams with async memcopies and FFTs.
    // Work item ii goes to stream ii % NUM_STREAMS and reuses that stream's
    // buffers; operations queued on one stream execute in issue order.
    for (int ii = 0; ii < NUM_DATA; ii++) {
        int jj = ii % NUM_STREAMS;
        check_cuda_errors(cudaMemcpyAsync(d_in[jj], h_in[jj], BYTES, cudaMemcpyHostToDevice, streams[jj]));
        check_cufft(cufftExecC2C(plans[jj], (cufftComplex*)d_in[jj], (cufftComplex*)d_out[jj], CUFFT_FORWARD),
                    "cufftExecC2C", __LINE__);
        check_cuda_errors(cudaMemcpyAsync(h_out[jj], d_out[jj], BYTES, cudaMemcpyDeviceToHost, streams[jj]));
    }

    // Wait for calculations to complete.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaStreamSynchronize(streams[ii]));
    }

    // Destroy the plans before the streams they are bound to.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cufft(cufftDestroy(plans[ii]), "cufftDestroy", __LINE__);
    }
    delete[] plans;

    // Free memory and streams.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaHostUnregister(h_in[ii]));
        check_cuda_errors(cudaHostUnregister(h_out[ii]));
        check_cuda_errors(cudaFree(d_in[ii]));
        check_cuda_errors(cudaFree(d_out[ii]));
        delete[] h_in[ii];
        delete[] h_out[ii];
        check_cuda_errors(cudaStreamDestroy(streams[ii]));
    }

    // Release the tables of per-stream pointers themselves.
    delete[] h_in;
    delete[] h_out;
    delete[] d_in;
    delete[] d_out;

    check_cuda_errors(cudaDeviceReset());
    return 0;
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.