简体   繁体   English


[英]cuFFT and streams

I'm trying to launch multiple CUDA FFT kernels asynchronously using streams. 我正在尝试使用流异步启动多个CUDA FFT内核。 For that, I'm creating my streams, cuFFT forward and inverse plans as follows: 为此,我正在创建我的流,cuFFT前向和反向计划如下:

// Allocate one stream handle and one forward/inverse plan handle per stream slot.
streams = (cudaStream_t*) malloc(sizeof(cudaStream_t)*streamNum);
plansF = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
plansI = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
for(int i=0; i<streamNum; i++)
{
    // malloc only reserves storage for the handle; the stream itself must be
    // created before anything can be bound to it.
    cudaStreamCreate(&streams[i]);

    // One single-batch R2C (forward) and C2R (inverse) plan of length `ticks`.
    cufftPlan1d(&plansF[i], ticks, CUFFT_R2C, 1);
    cufftPlan1d(&plansI[i], ticks, CUFFT_C2R, 1);

    // Bind each plan to its stream; a plan that is never bound executes in
    // the default stream.
    cufftSetStream(plansF[i], streams[i]);
    cufftSetStream(plansI[i], streams[i]);
}
// NOTE(review): return codes (cudaError_t / cufftResult) and the malloc
// results are not checked in this snippet; check them in real code.

In the main function, I'm launching forward FFTs as follows: main函数中,我正在启动正向FFT,如下所示:

    // Launch the real-to-complex FFT of plan `w`, reading from offset k of
    // gpuMem1 and writing to offset j of gpuMem2.
    // NOTE(review): the cufftResult returned by cufftExecR2C is discarded here.
    cufftExecR2C(plansF[w], gpuMem1+k,gpuMem2+j);
    // Advance both offsets for the next transform.
    // NOTE(review): rect_small_real / rect_small_complex are presumably the
    // per-transform element counts of the real input and the complex output;
    // confirm against their definitions.
    k += rect_small_real;
    j += rect_small_complex;

I also have other kernels that I launch asynchronously with the same streams. 我还有其他内核,我使用相同的流异步启动。

When I profile my application using Visual Profiler 5.0, I see that all kernels except the CUDA FFT ones (both forward and inverse) run in parallel and overlap. 当我使用Visual Profiler 5.0分析我的应用程序时,我发现除了CUDA FFT(正向和反向)之外的所有内核并行运行并重叠。 The FFT kernels do run in different streams, but they do not overlap; they actually run sequentially. FFT内核确实在不同的流中运行,但它们不重叠,因为它们实际上是顺序运行的。 Can anyone tell me what my problem is? 谁能告诉我我的问题是什么?

My environment is VS 2008, 64 bit, Windows 7. 我的环境是VS 2008,64位,Windows 7。

Thanks. 谢谢。

This is a worked example of cuFFT execution and memcopies using streams in CUDA on the Kepler architecture. 这是在Kepler体系结构中使用CUDA中的流的cuFFT执行和memcopies的工作示例。

Here is the code: 这是代码:

#include <stdio.h>

#include <cufft.h>

#define NUM_STREAMS 3

// Wraps a CUDA runtime call and reports a failure together with its call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**
 * Reports a failed CUDA runtime call and optionally terminates the process.
 *
 * @param code   Status returned by the CUDA runtime call.
 * @param file   Source file of the call site (normally __FILE__).
 * @param line   Source line of the call site (normally __LINE__).
 * @param abort  When true (the default), exit using `code` as the exit status.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    // `file` is const because __FILE__ is a string literal.
    if (code == cudaSuccess) return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}

/* cuFFT ERROR CHECK */
// cuFFT reports failures through cufftResult rather than cudaError_t, so it
// gets its own checker next to gpuErrchk.
#define cufftErrchk(ans) { cufftAssert((ans), __FILE__, __LINE__); }
static inline void cufftAssert(cufftResult code, const char *file, int line, bool abort=true)
{
    if (code == CUFFT_SUCCESS) return;

    fprintf(stderr,"cuFFTassert: error code %d %s %d\n", (int)code, file, line);
    if (abort) exit((int)code);
}

/* MAIN */
/**
 * Runs one forward C2C FFT of length N per stream, with the host-to-device
 * copy, the transform and the device-to-host copy of each data set all
 * enqueued in that data set's own stream.
 *
 * @return 0 on success; gpuErrchk / cufftErrchk terminate the process on the
 *         first failing CUDA or cuFFT call.
 */
int main()
{
    const int N = 5000;
    const size_t bytes = N*sizeof(float2);

    // --- Host input and output data initialization (one buffer pair per stream)
    float2 *h_in[NUM_STREAMS];
    float2 *h_out[NUM_STREAMS];
    for (int s = 0; s < NUM_STREAMS; s++) {
        h_in[s]  = new float2[N];
        h_out[s] = new float2[N];
        for (int i = 0; i < N; i++) {
            h_in[s][i].x  = 1.f;
            h_in[s][i].y  = 0.f;
            h_out[s][i].x = 0.f;
            h_out[s][i].y = 0.f;
        }
    }

    // --- Registers host memory as page-locked (required for asynchronous cudaMemcpyAsync)
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaHostRegister(h_in[s],  bytes, cudaHostRegisterPortable));
        gpuErrchk(cudaHostRegister(h_out[s], bytes, cudaHostRegisterPortable));
    }

    // --- Device input and output data allocation
    float2 *d_in[NUM_STREAMS];
    float2 *d_out[NUM_STREAMS];
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaMalloc((void**)&d_in[s],  bytes));
        gpuErrchk(cudaMalloc((void**)&d_out[s], bytes));
    }

    // --- Creates CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for (int s = 0; s < NUM_STREAMS; s++) gpuErrchk(cudaStreamCreate(&streams[s]));

    // --- Creates cuFFT plans and sets them in streams
    cufftHandle plans[NUM_STREAMS];
    for (int s = 0; s < NUM_STREAMS; s++) {
        cufftErrchk(cufftPlan1d(&plans[s], N, CUFFT_C2C, 1));
        cufftErrchk(cufftSetStream(plans[s], streams[s]));
    }

    // --- Async memcopies and computations, issued breadth-first:
    //     all host-to-device copies, then all FFTs, then all device-to-host copies.
    //     Work enqueued in the same stream still runs in issue order.
    for (int s = 0; s < NUM_STREAMS; s++)
        gpuErrchk(cudaMemcpyAsync(d_in[s], h_in[s], bytes, cudaMemcpyHostToDevice, streams[s]));
    for (int s = 0; s < NUM_STREAMS; s++)
        cufftErrchk(cufftExecC2C(plans[s], (cufftComplex*)d_in[s], (cufftComplex*)d_out[s], CUFFT_FORWARD));
    for (int s = 0; s < NUM_STREAMS; s++)
        gpuErrchk(cudaMemcpyAsync(h_out[s], d_out[s], bytes, cudaMemcpyDeviceToHost, streams[s]));

    // --- Waits until every stream has drained before touching or freeing the buffers
    for (int s = 0; s < NUM_STREAMS; s++)
        gpuErrchk(cudaStreamSynchronize(streams[s]));

    // --- Releases resources
    for (int s = 0; s < NUM_STREAMS; s++) {
        cufftErrchk(cufftDestroy(plans[s]));

        // Page-locked registrations must be removed before the memory is freed.
        gpuErrchk(cudaHostUnregister(h_in[s]));
        gpuErrchk(cudaHostUnregister(h_out[s]));

        gpuErrchk(cudaFree(d_in[s]));
        gpuErrchk(cudaFree(d_out[s]));

        gpuErrchk(cudaStreamDestroy(streams[s]));

        delete[] h_in[s];
        delete[] h_out[s];
    }

    return 0;
}

Please make sure that cuFFT return codes are checked, as described in "CUFFT error handling". 请确保按照CUFFT错误处理中的说明检查cuFFT返回码。

Below, some profiling information when testing the above algorithm on a Kepler K20c card is provided. 下面,提供了在Kepler K20c卡上测试上述算法时的一些分析信息。 As you will see, you will achieve a true overlap between computation and memory transfers only provided that you have a sufficiently large N . 正如您将看到的,只有当您有足够大的N ,才能实现计算和内存传输之间的真正重叠。

N = 5000


N = 50000


N = 500000


The problem is in the hardware you use. 问题出在你使用的硬件上。

All CUDA-capable GPUs are capable of executing a kernel and copying data in both directions concurrently. 所有支持CUDA的GPU都能够同时执行内核并在两个方向上复制数据。 However, only devices with Compute Capability 3.5 or higher have the feature named Hyper-Q. 但是,只有具有Compute Capability 3.5或更高版本的设备才具有名为Hyper-Q的功能。

Briefly, in these GPUs several (16, I suppose) hardware kernel queues are implemented. 简而言之,在这些GPU中实现了几个(我认为是16个)硬件内核队列。 In previous GPUs only one hardware queue is available. 在之前的GPU中,只有一个硬件队列可用。

This means that cudaStreams are only virtual and their usage for old hardware makes sense only in case of overlapping computations and memory copying. 这意味着cudaStreams只是虚拟的,只有在重叠计算和内存复制的情况下,它们对旧硬件的使用才有意义。 Of course this is valid not only for cuFFT but also for your own kernels too! 当然,这不仅适用于cuFFT,也适用于您自己的内核!

Please look closely at the output of the Visual Profiler. 请深入了解visual profiler的输出。 You may unintentionally take the timeline visualization to be the exact record of GPU execution. 您可能会无意中将时间线可视化视为GPU执行的确切数据。 However, it is not that simple. 然而,事情并非那么简单。 There are several lines in which the displayed data may refer to the point in time at which the kernel launch line was executed (usually the orange ones). 有几行显示的数据可能指的是执行内核启动线的时间点(通常是橙色的)。 And this line corresponds to the execution of a specific kernel on the GPU (blue rectangles). 此行对应于GPU上的特定内核(蓝色矩形)的执行。 The same holds for memory transfers (the exact time is shown as light brown rectangles). 内存传输也是如此(确切的时间显示为浅棕色矩形)。

I hope this helped you solve your problem. 希望我帮你解决了你的问题。

Here's a riff on @JackOLantern's code that allows easy variation of the number of FFTs, FFT length, and stream count to experiment with GPU utilization in nvvp. 这是@ JackOLantern代码的一个重复段,允许轻松改变FFT的数量,FFT长度和流计数,以试验nvvp中的GPU利用率。

// Compile with:
// nvcc --std=c++11 stream_parallel.cu -o stream_parallel -lcufft

#include <cstdlib>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>

#include <cufft.h>

// Print file name, line number, and error code when a CUDA error occurs, then
// terminate the program.
#define check_cuda_errors(val)  __check_cuda_errors__ ( (val), #val, __FILE__, __LINE__ )

/**
 * Reports a non-zero status code with its call site and exits.
 *
 * @tparam T    Status type; it only has to be testable in a boolean context
 *              and convertible to unsigned int.
 * @param code  Status value returned by the checked call; zero means success.
 * @param func  Source text of the checked expression (from #val).
 * @param file  Source file of the call site (normally __FILE__).
 * @param line  Source line of the call site (normally __LINE__).
 *
 * The "type" string comes from cudaGetLastError(), not from `code`, so for
 * status types other than cudaError_t it may not describe the failure.
 * Calling cudaGetLastError() also resets the runtime's last-error state.
 */
template <typename T>
inline void __check_cuda_errors__(T code, const char *func, const char *file, int line) {
    if (!code) {
        return;
    }

    std::cout << "CUDA error at "
              << file << ":" << line << std::endl
              << "error code: " << (unsigned int) code
              << " type: \""  << cudaGetErrorString(cudaGetLastError()) << "\"" << std::endl
              << "func: \"" << func << "\""
              << std::endl;

    // Later CUDA calls would only fail in less obvious ways once one call has
    // failed, so stop here.
    std::exit(EXIT_FAILURE);
}

/**
 * Distributes NUM_DATA forward C2C FFTs of length N round-robin over
 * NUM_STREAMS streams. Each FFT is enqueued as a host-to-device copy, the
 * transform, and a device-to-host copy, all in the same stream.
 *
 * @param argc  Unused.
 * @param argv  Unused.
 * @return 0 on success.
 */
int main(int argc, char *argv[]) {
    (void) argc;
    (void) argv;

    // Number of FFTs to compute.
    const int NUM_DATA = 64;

    // Length of each FFT.
    const int N = 1048576;

    // Number of GPU streams across which to distribute the FFTs.
    const int NUM_STREAMS = 4;

    // Size in bytes of one input or output buffer.
    const size_t BYTES = N*sizeof(float2);

    // Allocate and initialize host input data.
    float2 **h_in = new float2 *[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        h_in[ii] = new float2[N];
        for (int jj = 0; jj < N; ++jj) {
            h_in[ii][jj].x = 1.f;
            h_in[ii][jj].y = 0.f;
        }
    }

    // Allocate and initialize host output data.
    float2 **h_out = new float2 *[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        h_out[ii] = new float2[N];
        for (int jj = 0; jj < N; ++jj) {
            h_out[ii][jj].x = 0.f;
            h_out[ii][jj].y = 0.f;
        }
    }

    // Pin host input and output memory for cudaMemcpyAsync.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaHostRegister(h_in[ii], BYTES, cudaHostRegisterPortable));
        check_cuda_errors(cudaHostRegister(h_out[ii], BYTES, cudaHostRegisterPortable));
    }

    // Allocate pointers to device input and output arrays.
    float2 **d_in = new float2 *[NUM_STREAMS];
    float2 **d_out = new float2 *[NUM_STREAMS];

    // Allocate input and output arrays on device.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaMalloc((void**)&d_in[ii], BYTES));
        check_cuda_errors(cudaMalloc((void**)&d_out[ii], BYTES));
    }

    // Create CUDA streams.
    cudaStream_t streams[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaStreamCreate(&streams[ii]));
    }

    // Create cuFFT plans and bind each one to its stream. The handles are
    // kept in a local array, so there is no separate allocation to release.
    cufftHandle plans[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cufftPlan1d(&plans[ii], N, CUFFT_C2C, 1));
        check_cuda_errors(cufftSetStream(plans[ii], streams[ii]));
    }

    // Fill streams with async memcopies and FFTs. Buffers are reused by every
    // FFT assigned to the same stream; work within one stream runs in issue
    // order, so the reuse does not overlap.
    for (int ii = 0; ii < NUM_DATA; ii++) {
        int jj = ii % NUM_STREAMS;
        check_cuda_errors(cudaMemcpyAsync(d_in[jj], h_in[jj], BYTES, cudaMemcpyHostToDevice, streams[jj]));
        check_cuda_errors(cufftExecC2C(plans[jj], (cufftComplex*)d_in[jj], (cufftComplex*)d_out[jj], CUFFT_FORWARD));
        check_cuda_errors(cudaMemcpyAsync(h_out[jj], d_out[jj], BYTES, cudaMemcpyDeviceToHost, streams[jj]));
    }

    // Wait for calculations to complete.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaStreamSynchronize(streams[ii]));
    }

    // Free memory, plans and streams.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cufftDestroy(plans[ii]));

        // Remove the page-locked registration before freeing the host memory.
        check_cuda_errors(cudaHostUnregister(h_in[ii]));
        check_cuda_errors(cudaHostUnregister(h_out[ii]));

        check_cuda_errors(cudaFree(d_in[ii]));
        check_cuda_errors(cudaFree(d_out[ii]));

        check_cuda_errors(cudaStreamDestroy(streams[ii]));

        delete[] h_in[ii];
        delete[] h_out[ii];
    }

    delete[] h_in;
    delete[] h_out;
    delete[] d_in;
    delete[] d_out;

    return 0;
}

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

粤ICP备18138465号  © 2020-2024 STACKOOM.COM