簡體   English   中英

cuFFT和流

[英]cuFFT and streams

我正在嘗試使用流異步啟動多個CUDA FFT內核。 為此,我正在創建我的流,cuFFT前向和反向計划如下:

streams = (cudaStream_t*) malloc(sizeof(cudaStream_t)*streamNum);
plansF = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
plansI = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
for(int i=0; i<streamNum; i++)  
{
    cudaStreamCreate(&streams[i]);
    CHECK_ERROR(5)
    cufftPlan1d(&plansF[i], ticks, CUFFT_R2C,1);
    CHECK_ERROR(5)
    cufftPlan1d(&plansI[i], ticks, CUFFT_C2R,1);
    CHECK_ERROR(5)
    cufftSetStream(plansF[i],streams[i]);
    CHECK_ERROR(5)
    cufftSetStream(plansI[i],streams[i]);
    CHECK_ERROR(5)
}

main函數中,我正在啟動正向FFT,如下所示:

for(w=1;w<q;w++)
  {
    cufftExecR2C(plansF[w], gpuMem1+k,gpuMem2+j);
    CHECK_ERROR(8)
    k += rect_small_real;
    j += rect_small_complex;
  }

我還有其他內核,我使用相同的流異步啟動。

當我使用Visual Profiler 5.0分析我的應用程序時,我發現除了CUDA FFT(正向和反向)之外的所有內核並行運行並重疊。 FFT內核確實在不同的流中運行,但它們不重疊,因為它們實際上是順序運行的。 誰能告訴我我的問題是什么?

我的環境是VS 2008,64位,Windows 7。

謝謝。

這是在Kepler體系結構中使用CUDA中的流的cuFFT執行和memcopies的工作示例。

這是代碼:

#include <stdio.h>

#include <cufft.h>

#define NUM_STREAMS 3

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/********/
/* MAIN */
/********/
int main()
{
    const int N = 5000;

    // --- Host input data initialization
    float2 *h_in1 = new float2[N];
    float2 *h_in2 = new float2[N];
    float2 *h_in3 = new float2[N];
    for (int i = 0; i < N; i++) {
        h_in1[i].x = 1.f;
        h_in1[i].y = 0.f;
        h_in2[i].x = 1.f;
        h_in2[i].y = 0.f;
        h_in3[i].x = 1.f;
        h_in3[i].y = 0.f;
    }

    // --- Host output data initialization
    float2 *h_out1 = new float2[N];
    float2 *h_out2 = new float2[N];
    float2 *h_out3 = new float2[N];
    for (int i = 0; i < N; i++) {
        h_out1[i].x = 0.f;
        h_out1[i].y = 0.f;
        h_out2[i].x = 0.f;
        h_out2[i].y = 0.f;
        h_out3[i].x = 0.f;
        h_out3[i].y = 0.f;
    }

    // --- Registers host memory as page-locked (required for asynch cudaMemcpyAsync)
    gpuErrchk(cudaHostRegister(h_in1, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_in2, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_in3, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_out1, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_out2, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_out3, N*sizeof(float2), cudaHostRegisterPortable));

    // --- Device input data allocation
    float2 *d_in1;          gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
    float2 *d_in2;          gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
    float2 *d_in3;          gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
    float2 *d_out1;         gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
    float2 *d_out2;         gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
    float2 *d_out3;         gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));

    // --- Creates CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));

    // --- Creates cuFFT plans and sets them in streams
    cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
    for (int i = 0; i < NUM_STREAMS; i++) {
        cufftPlan1d(&plans[i], N, CUFFT_C2C, 1);
        cufftSetStream(plans[i], streams[i]);
    }

    // --- Async memcopyes and computations
    gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, N*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
    gpuErrchk(cudaMemcpyAsync(d_in2, h_in2, N*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
    gpuErrchk(cudaMemcpyAsync(d_in3, h_in3, N*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
    cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD);
    cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD);
    cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD);
    gpuErrchk(cudaMemcpyAsync(h_out1, d_out1, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[0]));
    gpuErrchk(cudaMemcpyAsync(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[1]));
    gpuErrchk(cudaMemcpyAsync(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[2]));

    for(int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamSynchronize(streams[i]));

    // --- Releases resources
    gpuErrchk(cudaHostUnregister(h_in1));
    gpuErrchk(cudaHostUnregister(h_in2));
    gpuErrchk(cudaHostUnregister(h_in3));
    gpuErrchk(cudaHostUnregister(h_out1));
    gpuErrchk(cudaHostUnregister(h_out2));
    gpuErrchk(cudaHostUnregister(h_out3));
    gpuErrchk(cudaFree(d_in1));
    gpuErrchk(cudaFree(d_in2));
    gpuErrchk(cudaFree(d_in3));
    gpuErrchk(cudaFree(d_out1));
    gpuErrchk(cudaFree(d_out2));
    gpuErrchk(cudaFree(d_out3));

    for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));

    delete[] h_in1;
    delete[] h_in2;
    delete[] h_in3;
    delete[] h_out1;
    delete[] h_out2;
    delete[] h_out3;

    cudaDeviceReset();  

    return 0;
}

請根據CUFFT錯誤處理添加cuFFT錯誤檢查。

下面,提供了在Kepler K20c卡上測試上述算法時的一些分析信息。 正如您將看到的,只有當您有足夠大的N ,才能實現計算和內存傳輸之間的真正重疊。

N = 5000

在此輸入圖像描述

N = 50000

在此輸入圖像描述

N = 500000

在此輸入圖像描述

問題出在你使用的硬件上。

所有支持CUDA的GPU都能夠同時執行內核並以兩種方式復制數據。 但是,只有具有Compute Capability 3.5的設備才具有名為Hyper-Q的功能。

簡而言之,在這些GPU中實現了幾個(我認為是16個)硬件內核隊列。 在之前的GPU中,只有一個硬件隊列可用。

這意味着cudaStreams只是虛擬的,只有在重疊計算和內存復制的情況下,它們對舊硬件的使用才有意義。 當然,這不僅適用於cuFFT,也適用於您自己的內核!

請深入了解visual profiler的輸出。 您可能會無意中將時間線可視化視為GPU執行的確切數據。 然而,事情並非那么簡單。 有幾行顯示的數據可能指的是執行內核啟動線的時間點(通常是橙色的)。 此行對應於GPU上的特定內核(藍色矩形)的執行。 內存傳輸也是如此(確切的時間顯示為淺棕色矩形)。

希望,我幫你解決了你的問題。

這是@ JackOLantern代碼的一個重復段,允許輕松改變FFT的數量,FFT長度和流計數,以試驗nvvp中的GPU利用率。

// Compile with:
// nvcc --std=c++11 stream_parallel.cu -o stream_parallel -lcufft

#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>

#include <cufft.h>

// Print file name, line number, and error code when a CUDA error occurs.
#define check_cuda_errors(val)  __check_cuda_errors__ ( (val), #val, __FILE__, __LINE__ )

template <typename T>
inline void __check_cuda_errors__(T code, const char *func, const char *file, int line) {
    if (code) {
    std::cout << "CUDA error at "
          << file << ":" << line << std::endl
          << "error code: " << (unsigned int) code
          << " type: \""  << cudaGetErrorString(cudaGetLastError()) << "\"" << std::endl
          << "func: \"" << func << "\""
          << std::endl;
    cudaDeviceReset();
    exit(EXIT_FAILURE);
    }
}

int main(int argc, char *argv[]) {

    // Number of FFTs to compute.
    const int NUM_DATA = 64;

    // Length of each FFT.
    const int N = 1048576;

    // Number of GPU streams across which to distribute the FFTs.
    const int NUM_STREAMS = 4;

    // Allocate and initialize host input data.
    float2 **h_in = new float2 *[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        h_in[ii] = new float2[N];
        for (int jj = 0; jj < N; ++jj) {
            h_in[ii][jj].x = (float) 1.f;
            h_in[ii][jj].y = (float) 0.f;
        }
    }

    // Allocate and initialize host output data.
    float2 **h_out = new float2 *[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
    h_out[ii] = new float2[N];
    for (int jj = 0; jj < N; ++jj) {
            h_out[ii][jj].x = 0.f;
            h_out[ii][jj].y = 0.f;
        }
    }

    // Pin host input and output memory for cudaMemcpyAsync.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaHostRegister(h_in[ii], N*sizeof(float2), cudaHostRegisterPortable));
        check_cuda_errors(cudaHostRegister(h_out[ii], N*sizeof(float2), cudaHostRegisterPortable));
    }

    // Allocate pointers to device input and output arrays.
    float2 **d_in = new float2 *[NUM_STREAMS];
    float2 **d_out = new float2 *[NUM_STREAMS];

    // Allocate intput and output arrays on device.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaMalloc((void**)&d_in[ii], N*sizeof(float2)));
        check_cuda_errors(cudaMalloc((void**)&d_out[ii], N*sizeof(float2)));
    }

    // Create CUDA streams.
    cudaStream_t streams[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaStreamCreate(&streams[ii]));
    }

    // Creates cuFFT plans and sets them in streams
    cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        cufftPlan1d(&plans[ii], N, CUFFT_C2C, 1);
        cufftSetStream(plans[ii], streams[ii]);
    }

    // Fill streams with async memcopies and FFTs.
    for (int ii = 0; ii < NUM_DATA; ii++) {
        int jj = ii % NUM_STREAMS;
        check_cuda_errors(cudaMemcpyAsync(d_in[jj], h_in[jj], N*sizeof(float2), cudaMemcpyHostToDevice, streams[jj]));
        cufftExecC2C(plans[jj], (cufftComplex*)d_in[jj], (cufftComplex*)d_out[jj], CUFFT_FORWARD);
        check_cuda_errors(cudaMemcpyAsync(h_out[jj], d_out[jj], N*sizeof(float2), cudaMemcpyDeviceToHost, streams[jj]));
    }

    // Wait for calculations to complete.
    for(int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaStreamSynchronize(streams[ii]));
    }

    // Free memory and streams.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaHostUnregister(h_in[ii]));
        check_cuda_errors(cudaHostUnregister(h_out[ii]));
        check_cuda_errors(cudaFree(d_in[ii]));
        check_cuda_errors(cudaFree(d_out[ii]));
        delete[] h_in[ii];
        delete[] h_out[ii];
        check_cuda_errors(cudaStreamDestroy(streams[ii]));
    }

    delete plans;

    cudaDeviceReset();  

    return 0;
}

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM