How do I execute both cufftXt and CUDA kernels on multiple GPUs?

I would like to use two GPUs to execute a kernel and then run an FFT using cufftXt. The data could be several gigabytes in size. My understanding of allocating memory for kernels on 2 GPUs is that you should split the host array in half and send the first half to GPU0 and the other half to GPU1. The following example shows how this can be done.

#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>

#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>

using namespace std;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void Cube (cufftReal *data, cufftReal *data3, int N, int real_size) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < real_size) {
        // Skip the two padding reals at the end of each row of the
        // in-place R2C layout; only the first N values per row are data.
        float x = (i % (N+2));
        if (x < N) {
            data3[i] = pow(data[i], 3.0f);
        }
        else {
            data3[i] = 0.0f;
        }
    }
    // No __syncthreads() needed: threads do not share any data here.
}


int main (int argc, char **argv) {

    int x;
    int N = 8;

    // In-place R2C storage: each row of N reals is padded to N+2.
    int cplx_size = N * (N/2 + 1);
    int real_size = 2 * cplx_size;
    int mem_size = sizeof(cufftReal)*real_size;
    int half_real_size = real_size/2;
    int half_mem_size = mem_size/2;

    cufftReal *h_data = (cufftReal*)malloc(mem_size);
    cufftReal *h_data3 = (cufftReal*)malloc(mem_size);

    cufftReal *h0_data = (cufftReal*)malloc(half_mem_size);
    cufftReal *h0_data3 = (cufftReal*)malloc(half_mem_size);
    cufftReal *h1_data = (cufftReal*)malloc(half_mem_size);
    cufftReal *h1_data3 = (cufftReal*)malloc(half_mem_size);

    for(int i=0; i<real_size; i++){
        x = (i % (N+2));
        if(x < N){h_data[i] = 2;}
        else{h_data[i] = 0;}
    }

    for(int i=0; i<half_real_size; i++){
        h0_data[i] = h_data[i];
        h1_data[i] = h_data[i+half_real_size];
    }

    cufftReal *d0_data;
    cufftReal *d0_data3;
    cufftReal *d1_data;
    cufftReal *d1_data3;

    cudaSetDevice(0);
    gpuErrchk(cudaMalloc((void**)&d0_data, half_mem_size));
    gpuErrchk(cudaMalloc((void**)&d0_data3, half_mem_size));
    cudaSetDevice(1);
    gpuErrchk(cudaMalloc((void**)&d1_data, half_mem_size));
    gpuErrchk(cudaMalloc((void**)&d1_data3, half_mem_size));

cout <<"device memory allocated" <<endl;

    int maxThreads = (N>1024) ? 1024 : N;
    int threadsPerBlock = maxThreads;
    // Round up so sizes that are not a multiple of the block size
    // are still fully covered.
    int numBlocks = (half_real_size + threadsPerBlock - 1)/threadsPerBlock;

    cudaSetDevice(0);
    gpuErrchk(cudaMemcpy(d0_data, h0_data, half_mem_size, cudaMemcpyHostToDevice));
    cudaSetDevice(1);
    gpuErrchk(cudaMemcpy(d1_data, h1_data, half_mem_size, cudaMemcpyHostToDevice));

cout <<"mem copied to devices" <<endl;

    cudaSetDevice(0);
    Cube <<<numBlocks, threadsPerBlock>>> (d0_data, d0_data3, N, half_real_size);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    cudaSetDevice(1);
    Cube <<<numBlocks, threadsPerBlock>>> (d1_data, d1_data3, N, half_real_size);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    cudaSetDevice(0);
    gpuErrchk(cudaMemcpy(h0_data3, d0_data3, half_mem_size, cudaMemcpyDeviceToHost));
    cudaSetDevice(1);
    gpuErrchk(cudaMemcpy(h1_data3, d1_data3, half_mem_size, cudaMemcpyDeviceToHost));   

    cout <<endl;
    for(int i = 0; i<half_real_size; i++){
        cout <<h0_data3[i] <<" ";
    }
    cout <<endl;
    for(int i = 0; i<half_real_size; i++){
        cout <<h1_data3[i] <<" ";
    }

    //clean up
    cudaSetDevice(0);
    cudaFree(d0_data);
    cudaFree(d0_data3);
    cudaSetDevice(1);
    cudaFree(d1_data);
    cudaFree(d1_data3);
    free(h_data);
    free(h_data3);
    free(h0_data);
    free(h0_data3);
    free(h1_data);
    free(h1_data3);
    return 0;
}

However, I do not see how this approach is compatible with cufftXt. It looks like I should use the helper function cufftXtMemcpy, which splits the data across the devices automatically. But if I do that, the multi-GPU kernel approach shown above is unusable unless I allocate separate device memory for cufftXt and for the kernels. Is there a way to run both cufftXt and the kernels without doubling the device memory allocation?
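For what it's worth, it looks like the per-GPU pointers and sizes can be read straight out of the cudaLibXtDesc that cufftXtMalloc returns, so a kernel could run in place on that memory with no second allocation. A minimal sketch of the idea (my own, untested; PointwiseOp stands in for any elementwise kernel taking a pointer and an element count):

// Sketch: run an elementwise kernel in place on cufftXt-managed memory.
// d_desc is a cudaLibXtDesc* returned by cufftXtMalloc; PointwiseOp is a
// hypothetical kernel with signature (cufftComplex*, size_t).
cudaXtDesc *desc = d_desc->descriptor;
for (int g = 0; g < desc->nGPUs; g++) {
    cudaSetDevice(desc->GPUs[g]);
    size_t elems = desc->size[g] / sizeof(cufftComplex);  // this GPU's share
    int blocks = (int)((elems + 1023) / 1024);            // 1024 threads/block
    PointwiseOp <<<blocks, 1024>>> ((cufftComplex*)desc->data[g], elems);
}
for (int g = 0; g < desc->nGPUs; g++) {
    cudaSetDevice(desc->GPUs[g]);
    cudaDeviceSynchronize();                              // sync every device
}

One caveat I am aware of: between a forward and an inverse transform the descriptor holds the data in cuFFT's shuffled order (CUFFT_XT_FORMAT_INPLACE_SHUFFLED), which should be harmless for purely elementwise kernels like these but would matter for anything position-dependent.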

Here is what I did, following the simpleCUFFT_2d_MGPU code sample from the toolkit. I am not sure it is entirely correct: it runs 50% slower on 2 GPUs than the equivalent code on 1 GPU. I tested it on Tesla K40 GPUs (against another code that uses R2C and C2R FFTs).

#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include <ctime>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <string>
#include <stdlib.h>
#include <stdio.h>

#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>

using namespace std;

__global__ void Cube (cufftComplex *data, cufftComplex *data3, int N, int n, int nGPUs) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Each launch operates on a single GPU's segment of the descriptor,
    // which holds only n/nGPUs of the n total elements.
    if (i < n/nGPUs){
        data3[i].x = pow(data[i].x, 3.0f);
        data3[i].y = 0;
    }
}

__global__ void Normalize (cufftComplex *data, int N, int n, int nGPUs){

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Bound by this GPU's share (n/nGPUs elements), but divide by the
    // full transform size n = N*N, and scale both components: the
    // inverse FFT leaves the imaginary part scaled as well.
    if (i < n/nGPUs){
        data[i].x /= n;
        data[i].y /= n;
    }
}

int main (int argc, char **argv) {

    int x, y;
    int N = 8192;
    int n = N*N;

    int mem_size = sizeof(cufftComplex)*n;

    int maxThreads = (N>1024) ? 1024 : N;
    int threadsPerBlock = maxThreads;
    int numBlocks = n/threadsPerBlock;
    cout <<"numBlocks " <<numBlocks <<endl;

    cufftComplex *h_data = (cufftComplex*)malloc(mem_size);
    cufftComplex *h_data3 = (cufftComplex*)malloc(mem_size);

cout <<"host data allocated" <<endl;

    int index;
    float lambda = N*.1;
    for(y=0; y<N; y++){
        for(x=0; x<N; x++){
            index = x + y*N;
            h_data[index].x = cos(2*M_PI*(x+y)/lambda);
            h_data[index].y = 0;
        }
    }

cout <<"host data values set" <<endl;

    cufftResult res;
    int  device;
    int nGPUs;
    cudaGetDeviceCount(&nGPUs);
    cout <<nGPUs <<" CUDA devices" <<endl;

    size_t total_mem, free_mem;
    for(int i=0; i<nGPUs; i++){
        // cudaMemGetInfo reports on the current device, so select it first.
        cudaSetDevice(i);
        cudaMemGetInfo(&free_mem, &total_mem);
        cout <<"GPU" <<i <<" used memory " <<(total_mem-free_mem)/pow(10,9) <<" GB" <<endl;
    }

    int whichGPUs[nGPUs];
    for(int i=0; i<nGPUs; i++){
        whichGPUs[i]=i;
    }

cout <<"whichgpus set" <<endl;

    size_t* worksize;
    worksize = (size_t*)malloc(sizeof(size_t) * nGPUs);

cout <<"worksize set" <<endl;

    cufftHandle plan_complex;
    res = cufftCreate(&plan_complex);
        if (res != CUFFT_SUCCESS){cout <<"create plan failed" <<endl;}
    res = cufftXtSetGPUs(plan_complex, nGPUs, whichGPUs);
        if (res != CUFFT_SUCCESS){cout <<"setgpus forward failed" <<endl;}
cout <<"set gpus" <<endl;
    res = cufftMakePlan2d(plan_complex, N, N, CUFFT_C2C, worksize);
        if (res != CUFFT_SUCCESS){cout <<"make plan forward failed" <<endl;}

cout <<"plan created" <<endl;

    cudaLibXtDesc *d_data; 
    cudaLibXtDesc *d_data3;

    res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data, CUFFT_XT_FORMAT_INPLACE);
        if (res != CUFFT_SUCCESS){cout <<"data malloc failed" <<endl;}

    res = cufftXtMalloc(plan_complex, (cudaLibXtDesc **)&d_data3, CUFFT_XT_FORMAT_INPLACE);
        if (res != CUFFT_SUCCESS){cout <<"data3 malloc failed" <<endl;}

cout <<"xtmalloc done" <<endl;

    res = cufftXtMemcpy (plan_complex, d_data, h_data, CUFFT_COPY_HOST_TO_DEVICE);
        if (res != CUFFT_SUCCESS){cout <<"memcpy to device failed" <<endl;}

cout <<"memcpy h to d" <<endl;

int tmax = 10000;
int start = time(0);

for(int tau=0; tau<tmax; tau++){

    res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_FORWARD);
        if (res != CUFFT_SUCCESS){cout <<"cufftXtExec failed" <<endl; return 0;}

    res = cufftXtExecDescriptorC2C(plan_complex, d_data, d_data, CUFFT_INVERSE);
        if (res != CUFFT_SUCCESS){cout <<"cufftXtExec failed" <<endl; return 0;}

    for(int i=0; i<nGPUs; i++){
        device = d_data->descriptor->GPUs[i];
        cudaSetDevice(device);
        // Each GPU holds only n/nGPUs elements, so launch a grid sized for
        // the segment (numBlocks is assumed divisible by nGPUs here);
        // launching all numBlocks blocks would read and write out of bounds.
        Normalize <<<numBlocks/nGPUs, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], N, n, nGPUs);
    }

    // cudaDeviceSynchronize only syncs the current device; sync each one.
    for(int i=0; i<nGPUs; i++){
        cudaSetDevice(d_data->descriptor->GPUs[i]);
        cudaDeviceSynchronize();
    }

}

int stop = time(0);
cout <<tmax <<" timesteps" <<endl <<(stop-start) <<" seconds"<<endl;

/*
    for(int i=0; i<nGPUs; i++){
        device = d_data->descriptor->GPUs[i];
        cudaSetDevice(device);
        Cube <<<numBlocks/nGPUs, threadsPerBlock>>> ((cufftComplex*) d_data->descriptor->data[i], (cufftComplex*) d_data3->descriptor->data[i], N, n, nGPUs);
    }
*/

/*
    cudaDeviceSynchronize();

    res = cufftXtMemcpy (plan_complex, h_data, d_data, CUFFT_COPY_DEVICE_TO_HOST);
        if (res != CUFFT_SUCCESS){cout <<"memcpy to host failed" <<endl;}

cout <<"memcpy d to h" <<endl;

    ofstream fout;
    ostringstream outstr;
    outstr.precision(4);
    outstr <<time(0) <<".dat";
    string filename=outstr.str();
    fout.open(filename.c_str());
    fout.precision(4);
    for (int i = 0; i < n; i++) {
        x = (i % (N));
            y = (i /(N))%N; 
            fout <<x <<" " <<y <<" " <<h_data[i].x <<endl;
    }
    fout.close();  
*/

    //clean up
    res = cufftXtFree(d_data);
        if (res != CUFFT_SUCCESS){cout <<"free data failed" <<endl;}
    res = cufftXtFree(d_data3);
        if (res != CUFFT_SUCCESS){cout <<"free data3 failed" <<endl;}
    cufftDestroy(plan_complex);
    free(h_data);
    free(h_data3);
    free(worksize);

    return 0;
}
