
cudaMalloc vs cudaMalloc3D performance for a 2D array

I want to know the impact on performance of using cudaMalloc versus cudaMalloc3D when allocating, copying, and accessing memory for a 2D array. I wrote two versions of the same code to compare run times, one using cudaMalloc and the other cudaMalloc3D; both are included below. An explanation of how performance is affected by either API would be much appreciated.

cudaMalloc code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PI 3.14159265
#define NX 8192     /* includes boundary points on both end */
#define NY 4096     /* includes boundary points on both end */

#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X 
#define N_BLOCKS_Y NY/N_THREADS_Y 

#define LX 4.0    /* length of the domain in x-direction  */
#define LY 2.0    /* length of the domain in y-direction  */
#define dx       (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd     5.0
#define dt       (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR   ( cSqrd * (dt*dt)/(dx*dx) )

#define IC  (i + j*NX)       /* (i,j)   */
#define IM1 (i + j*NX - 1)   /* (i-1,j) */
#define IP1 (i + j*NX + 1)   /* (i+1,j) */
#define JM1 (i + (j-1)*NX)   /* (i,j-1) */
#define JP1 (i + (j+1)*NX)   /* (i,j+1) */


#define cudaCheckError() {\
  cudaError_t e = cudaGetLastError() ; \
  if( e != cudaSuccess ) {\
    printf("\nCuda Failure %s:%d: %s\n",__FILE__,__LINE__,cudaGetErrorString(e));\
    exit(EXIT_FAILURE);\
  }\
}

typedef double REAL;
typedef int    INT;                

__global__ void solveWaveGPU ( REAL *uold, REAL *u, REAL *unew )
{

 INT i,j;

 i = blockIdx.x*blockDim.x + threadIdx.x;
 j = blockIdx.y*blockDim.y + threadIdx.y;

 if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {

    unew[IC] = 2.0*u[IC] - uold[IC] + FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );

 }
} 

void initWave ( REAL *unew, REAL *u, REAL *uold, REAL *x, REAL *y )
{                    

    INT i,j;

    for (j=1; j<NY-1; j++) {
        for (i=1; i<NX-1; i++) {
            u[IC] =  0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
        }
    }
    for (j=1; j<NY-1; j++) {
        for (i=1; i<NX-1; i++) {
            uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
        }
    }
}

void meshGrid ( REAL *x, REAL *y )
{

    INT i,j;
    REAL a;

    for (j=0; j<NY; j++) {
        a = dx * ( (REAL) j );
        for (i=0; i<NX; i++) {
            x[IC] =  dx * ( (REAL) i );
            y[IC] = a;
        }
     }
}

INT main(INT argc, char *argv[])
{ 

    INT nTimeSteps = 100;

    REAL *unew,   *u,   *uold,   *uFinal, *x, *y; //pointers for the host side
    REAL *d_unew, *d_u, *d_uold, *tmp;                 //pointers for the device

//  variable declaration for timing
    cudaEvent_t timeStart, timeStop;
    cudaEventCreate(&timeStart);
    cudaEventCreate(&timeStop);
    float elapsedTime_gpu;

    unew           = (REAL *)calloc(NX*NY,sizeof(REAL));
    u              = (REAL *)calloc(NX*NY,sizeof(REAL));
    uold           = (REAL *)calloc(NX*NY,sizeof(REAL));
    uFinal         = (REAL *)calloc(NX*NY,sizeof(REAL));
    x              = (REAL *)calloc(NX*NY,sizeof(REAL));
    y              = (REAL *)calloc(NX*NY,sizeof(REAL));

// create device copies of the variables

    cudaMalloc( (void**) &d_unew,  NX*NY*sizeof(REAL) ); cudaCheckError(); 
    cudaMalloc( (void**) &d_u,     NX*NY*sizeof(REAL) ); cudaCheckError();
    cudaMalloc( (void**) &d_uold,  NX*NY*sizeof(REAL) ); cudaCheckError();

    meshGrid( x, y );
    initWave( unew, u, uold, x, y );

//  start timing the GPU

    cudaMemcpy( d_u, u,       NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();
    cudaMemcpy( d_uold, uold, NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();
    cudaMemcpy( d_unew, unew, NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();

//  set up the GPU grid/block model

    dim3 dimGrid  ( N_BLOCKS_X , N_BLOCKS_Y  );
    dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );  

//  launch the GPU kernel

    cudaEventRecord(timeStart, 0);

    for (INT n=1; n<nTimeSteps+1; n++) {

       solveWaveGPU <<<dimGrid,dimBlock>>>(d_uold, d_u, d_unew);
       cudaDeviceSynchronize();
       cudaCheckError();

       tmp    = d_uold;
       d_uold = d_u;
       d_u    = d_unew;
       d_unew = tmp;

    }
    cudaEventRecord(timeStop, 0);
    cudaEventSynchronize(timeStop);
    cudaEventElapsedTime(&elapsedTime_gpu, timeStart, timeStop);

    cudaMemcpy( uFinal, d_u, NX*NY*sizeof(REAL), cudaMemcpyDeviceToHost ); cudaCheckError();

    printf("elapsedTime on the GPU= %f s.\n", elapsedTime_gpu/1000.0);

    free(unew);       free(u);       free(uold);
    cudaFree(d_unew); cudaFree(d_u); cudaFree(d_uold);
    free(uFinal); free(x); free(y);

    cudaEventDestroy(timeStart);
    cudaEventDestroy(timeStop);

    return (0);
}

cudaMalloc3D code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PI 3.14159265
#define NX 8192     /* includes boundary points on both end */
#define NY 4096     /* includes boundary points on both end */
#define NZ 1        /* needed for cudaMalloc3D */

#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X 
#define N_BLOCKS_Y NY/N_THREADS_Y 

#define LX 4.0    /* length of the domain in x-direction  */
#define LY 2.0    /* length of the domain in y-direction  */
#define dx       (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd     5.0
#define dt       (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR   ( cSqrd * (dt*dt)/(dx*dx) )

#define IC  (i + j*NX)       /* (i,j)   */
#define IM1 (i + j*NX - 1)   /* (i-1,j) */
#define IP1 (i + j*NX + 1)   /* (i+1,j) */
#define JM1 (i + (j-1)*NX)   /* (i,j-1) */
#define JP1 (i + (j+1)*NX)   /* (i,j+1) */

#define cudaCheckError() {\
  cudaError_t e = cudaGetLastError() ; \
  if( e != cudaSuccess ) {\
    printf("\nCuda Failure %s:%d: %s\n",__FILE__,__LINE__,cudaGetErrorString(e));\
    exit(EXIT_FAILURE);\
  }\
}

typedef double REAL;
typedef int    INT;


__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{

 INT i,j;

 i = blockIdx.x*blockDim.x + threadIdx.x;
 j = blockIdx.y*blockDim.y + threadIdx.y;

 if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {

    char *d_u    = (char *) u.ptr;
    char *d_uold = (char *) uold.ptr;
    char *d_unew = (char *) unew.ptr;

    REAL *u_row = (REAL *)(d_u + j * u.pitch);             /* row j of u */
    REAL u_JP1  = ( (REAL *)(d_u + (j+1) * u.pitch) )[i];  /* u(i,j+1)   */
    REAL u_JM1  = ( (REAL *)(d_u + (j-1) * u.pitch) )[i];  /* u(i,j-1)   */
    REAL u_IP1  = u_row[i+1];                              /* u(i+1,j)   */
    REAL u_IM1  = u_row[i-1];                              /* u(i-1,j)   */
    REAL u_IC   = u_row[i];                                /* u(i,j)     */

    REAL uold_IC  = ( (REAL *)(d_uold + j * uold.pitch) )[i];
    REAL *unew_row = (REAL *)(d_unew + j * unew.pitch);

    unew_row[i] = 2.0 * u_IC - uold_IC + FACTOR * ( u_IP1 + u_IM1 + u_JP1 + u_JM1 - 4.0 * u_IC );

 }

}                  

void initWave ( REAL *unew, REAL *u, REAL *uold, REAL *x, REAL *y )
{                    

    INT i,j;

    for (j=1; j<NY-1; j++) {
        for (i=1; i<NX-1; i++) {
            u[IC] =  0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
        }
    }
    for (j=1; j<NY-1; j++) {
        for (i=1; i<NX-1; i++) {
            uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
        }
    }
}

void meshGrid ( REAL *x, REAL *y )
{

    INT i,j;
    REAL a;

    for (j=0; j<NY; j++) {
        a = dx * ( (REAL) j );
        for (i=0; i<NX; i++) {
            x[IC] =  dx * ( (REAL) i );
            y[IC] = a;
        }
     }
}

INT main(INT argc, char *argv[])
{
    INT nTimeSteps = 100;
    REAL *unew, *u, *uold, *uFinal, *x, *y; //pointers for the host side

//  variable declaration for timing    
    cudaEvent_t timeStart, timeStop;
    cudaEventCreate(&timeStart);
    cudaEventCreate(&timeStop);
    float elapsedTime_gpu;

    unew           = (REAL *)calloc(NX*NY,sizeof(REAL));
    u              = (REAL *)calloc(NX*NY,sizeof(REAL));
    uold           = (REAL *)calloc(NX*NY,sizeof(REAL));
    uFinal         = (REAL *)calloc(NX*NY,sizeof(REAL));
    x              = (REAL *)calloc(NX*NY,sizeof(REAL));
    y              = (REAL *)calloc(NX*NY,sizeof(REAL));


    cudaExtent myExtent = make_cudaExtent(NX * sizeof(REAL), NY, NZ);
    cudaPitchedPtr d_u, d_uold, d_unew, d_tmp;

    // create device copies of the variables
    cudaMalloc3D( &d_u   , myExtent );   cudaCheckError();
    cudaMalloc3D( &d_uold, myExtent );   cudaCheckError();
    cudaMalloc3D( &d_unew, myExtent );   cudaCheckError();

    meshGrid( x, y );
    initWave( unew, u, uold, x, y );

    cudaMemcpy3DParms cpy3D = { 0 };
    cpy3D.extent = myExtent;
    cpy3D.kind   = cudaMemcpyHostToDevice;

    // copy 3D from u to d_u
    cpy3D.srcPtr = make_cudaPitchedPtr(u, NX*sizeof(REAL), NX, NY);
    cpy3D.dstPtr = d_u;
    cudaMemcpy3D( &cpy3D ); cudaCheckError();

    // copy 3D from uold to d_uold
    cpy3D.srcPtr = make_cudaPitchedPtr(uold, NX*sizeof(REAL), NX, NY);
    cpy3D.dstPtr = d_uold;
    cudaMemcpy3D( &cpy3D ); cudaCheckError();

    //  set up the GPU grid/block model
    dim3 dimGrid  ( N_BLOCKS_X , N_BLOCKS_Y  );
    dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );  

    //  launch the GPU kernel
    //  start timing the GPU
    cudaEventRecord(timeStart, 0);

    for (INT n=1; n<nTimeSteps+1; n++) {

       solveWaveGPU <<<dimGrid,dimBlock>>>(d_uold, d_u, d_unew);
       cudaDeviceSynchronize();
       cudaCheckError();

       d_tmp  = d_uold;
       d_uold = d_u;
       d_u    = d_unew;
       d_unew = d_tmp;

    }
    cudaEventRecord(timeStop, 0);
    cudaEventSynchronize(timeStop);
    cudaEventElapsedTime(&elapsedTime_gpu, timeStart, timeStop);


    // copy 3D from d_u to uFinal
    cpy3D.kind   = cudaMemcpyDeviceToHost;
    cpy3D.srcPtr = d_u;
    cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, NX*sizeof(REAL), NX, NY);
    cudaMemcpy3D( &cpy3D ); cudaCheckError();

    printf("elapsedTime on the GPU= %f s.\n", elapsedTime_gpu/1000.0);

    free(u);    cudaFree(d_unew.ptr);
    free(uold); cudaFree(d_u.ptr);
    free(unew); cudaFree(d_uold.ptr);
    free(uFinal); free(x); free(y);

    cudaEventDestroy(timeStart);
    cudaEventDestroy(timeStop);

    return (0);
}

Timing:

cudaMalloc3D: 1.192510 s
cudaMalloc:   0.960322 s

Machine specification:

GNU/Linux x86_64
NVIDIA GeForce GTX Titan CC: 3.5
CUDA ver 7.0

The performance difference you observe is mostly due to the increased instruction overhead of the pitched memory indexing scheme. Because your array size is a large power of two in the major direction, it is very likely that the pitched array allocated with cudaMalloc3D has the same pitch (and therefore the same size) as the naïve allocation made with cudaMalloc. You may find that the performance difference between the two versions changes if you vary the problem size.
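You can verify the "same pitch" claim directly. Here is a minimal sketch (not part of the original answer) that queries the pitch the runtime actually chose and compares it against the requested row width; for NX = 8192 doubles, the row is 65536 bytes, which is already a multiple of any plausible alignment, so the returned pitch should equal the width:

#include <stdio.h>

int main(void)
{
    cudaPitchedPtr p;
    /* same extent as the code in the question: 8192 doubles x 4096 rows x 1 slice */
    cudaExtent ext = make_cudaExtent(8192 * sizeof(double), 4096, 1);

    cudaMalloc3D(&p, ext);

    /* if the pitch equals the width, no padding was added and the
       allocation is byte-for-byte the same size as a plain cudaMalloc */
    printf("requested row width = %zu bytes, returned pitch = %zu bytes\n",
           ext.width, p.pitch);

    cudaFree(p.ptr);
    return 0;
}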

(Take note of the comments regarding compiler regressions in CUDA 7. If you refactor your code to pass the Fourier number as a kernel parameter, you will probably get a far bigger performance change than any difference due to pitched memory.)
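A minimal sketch of that refactoring for the cudaMalloc version (assuming the macros and launch code from the listing above are otherwise unchanged): evaluate FACTOR once on the host and pass it by value, so the compiler no longer has to materialize the sqrt-containing constant expression inside the kernel:

/* Sketch: FACTOR becomes a runtime kernel argument instead of a macro
   expanded inside the kernel body. IC/IP1/IM1/JP1/JM1 are the indexing
   macros from the cudaMalloc listing above. */
__global__ void solveWaveGPU ( REAL *uold, REAL *u, REAL *unew, REAL factor )
{
    INT i = blockIdx.x*blockDim.x + threadIdx.x;
    INT j = blockIdx.y*blockDim.y + threadIdx.y;

    if (i > 0 && i < (NX-1) && j > 0 && j < (NY-1)) {
        unew[IC] = 2.0*u[IC] - uold[IC]
                 + factor*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
    }
}

/* Host side: compute the constant once and pass it to every launch. */
const REAL factor = FACTOR;   /* cSqrd * dt*dt / (dx*dx), evaluated on the host */
solveWaveGPU<<<dimGrid,dimBlock>>>(d_uold, d_u, d_unew, factor);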
