cudaMalloc vs cudaMalloc3D performance for a 2D array

I want to know the performance impact of using cudaMalloc versus cudaMalloc3D when allocating, copying, and accessing memory for a 2D array. I wrote two versions of the same code to compare run times: one uses cudaMalloc and the other cudaMalloc3D. Both are included below. An explanation of how performance is affected by either API would be much appreciated.

cudaMalloc code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PI 3.14159265
#define NX 8192     /* includes boundary points on both ends */
#define NY 4096     /* includes boundary points on both ends */

#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X 
#define N_BLOCKS_Y NY/N_THREADS_Y 

#define LX 4.0    /* length of the domain in x-direction  */
#define LY 2.0    /* length of the domain in y-direction  */
#define dx       (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd     5.0
#define dt       (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR   ( cSqrd * (dt*dt)/(dx*dx) )

#define IC  (i + j*NX)       /* (i,j)   */
#define IM1 (i + j*NX - 1)   /* (i-1,j) */
#define IP1 (i + j*NX + 1)   /* (i+1,j) */
#define JM1 (i + (j-1)*NX)   /* (i,j-1) */
#define JP1 (i + (j+1)*NX)   /* (i,j+1) */


#define cudaCheckError() {\
  cudaError_t e = cudaGetLastError() ; \
  if( e != cudaSuccess ) {\
    printf("\nCuda Failure %s:%d: %s\n",__FILE__,__LINE__,cudaGetErrorString(e));\
    exit(EXIT_FAILURE);\
  }\
}

typedef double REAL;
typedef int    INT;                

__global__ void solveWaveGPU ( REAL *uold, REAL *u, REAL *unew )
{

 INT i,j;

 i = blockIdx.x*blockDim.x + threadIdx.x;
 j = blockIdx.y*blockDim.y + threadIdx.y;

 if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {

    unew[IC] = 2.0*u[IC] - uold[IC] + FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );

 }
} 

void initWave ( REAL *unew, REAL *u, REAL *uold, REAL *x, REAL *y )
{                    

    INT i,j;

    for (j=1; j<NY-1; j++) {
        for (i=1; i<NX-1; i++) {
            u[IC] =  0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
        }
    }
    for (j=1; j<NY-1; j++) {
        for (i=1; i<NX-1; i++) {
            uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
        }
    }
}

void meshGrid ( REAL *x, REAL *y )
{

    INT i,j;
    REAL a;

    for (j=0; j<NY; j++) {
        a = dx * ( (REAL) j );
        for (i=0; i<NX; i++) {
            x[IC] =  dx * ( (REAL) i );
            y[IC] = a;
        }
     }
}

INT main(INT argc, char *argv[])
{ 

    INT nTimeSteps = 100;

    REAL *unew,   *u,   *uold,   *uFinal, *x, *y; //pointers for the host side
    REAL *d_unew, *d_u, *d_uold, *tmp;                 //pointers for the device

//  variable declaration for timing
    cudaEvent_t timeStart, timeStop;
    cudaEventCreate(&timeStart);
    cudaEventCreate(&timeStop);
    float elapsedTime_gpu;

    unew           = (REAL *)calloc(NX*NY,sizeof(REAL));
    u              = (REAL *)calloc(NX*NY,sizeof(REAL));
    uold           = (REAL *)calloc(NX*NY,sizeof(REAL));
    uFinal         = (REAL *)calloc(NX*NY,sizeof(REAL));
    x              = (REAL *)calloc(NX*NY,sizeof(REAL));
    y              = (REAL *)calloc(NX*NY,sizeof(REAL));

// create device copies of the variables

    cudaMalloc( (void**) &d_unew,  NX*NY*sizeof(REAL) ); cudaCheckError(); 
    cudaMalloc( (void**) &d_u,     NX*NY*sizeof(REAL) ); cudaCheckError();
    cudaMalloc( (void**) &d_uold,  NX*NY*sizeof(REAL) ); cudaCheckError();

    meshGrid( x, y );
    initWave( unew, u, uold, x, y );

//  start timing the GPU

    cudaMemcpy( d_u, u,       NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();
    cudaMemcpy( d_uold, uold, NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();
    cudaMemcpy( d_unew, unew, NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();

//  set up the GPU grid/block model

    dim3 dimGrid  ( N_BLOCKS_X , N_BLOCKS_Y  );
    dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );  

//  launch the GPU kernel

    cudaEventRecord(timeStart, 0);

    for (INT n=1; n<nTimeSteps+1; n++) {

       solveWaveGPU <<<dimGrid,dimBlock>>>(d_uold, d_u, d_unew);
       cudaDeviceSynchronize();
       cudaCheckError();

       tmp    = d_uold;
       d_uold = d_u;
       d_u    = d_unew;
       d_unew = tmp;

    }
    cudaEventRecord(timeStop, 0);
    cudaEventSynchronize(timeStop);
    cudaEventElapsedTime(&elapsedTime_gpu, timeStart, timeStop);

    cudaMemcpy( uFinal, d_u, NX*NY*sizeof(REAL), cudaMemcpyDeviceToHost ); cudaCheckError();

    printf("elapsedTime on the GPU= %f s.\n", elapsedTime_gpu/1000.0);

    free(unew);       free(u);       free(uold);
    cudaFree(d_unew); cudaFree(d_u); cudaFree(d_uold);
    free(uFinal); free(x); free(y);

    cudaEventDestroy(timeStart);
    cudaEventDestroy(timeStop);

    return (0);
}

cudaMalloc3D code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PI 3.14159265
#define NX 8192     /* includes boundary points on both ends */
#define NY 4096     /* includes boundary points on both ends */
#define NZ 1        /* needed for cudaMalloc3D */

#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X 
#define N_BLOCKS_Y NY/N_THREADS_Y 

#define LX 4.0    /* length of the domain in x-direction  */
#define LY 2.0    /* length of the domain in y-direction  */
#define dx       (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd     5.0
#define dt       (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR   ( cSqrd * (dt*dt)/(dx*dx) )

#define IC  (i + j*NX)       /* (i,j)   */
#define IM1 (i + j*NX - 1)   /* (i-1,j) */
#define IP1 (i + j*NX + 1)   /* (i+1,j) */
#define JM1 (i + (j-1)*NX)   /* (i,j-1) */
#define JP1 (i + (j+1)*NX)   /* (i,j+1) */

#define cudaCheckError() {\
  cudaError_t e = cudaGetLastError() ; \
  if( e != cudaSuccess ) {\
    printf("\nCuda Failure %s:%d: %s\n",__FILE__,__LINE__,cudaGetErrorString(e));\
    exit(EXIT_FAILURE);\
  }\
}

typedef double REAL;
typedef int    INT;


__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{

 INT i,j;

 i = blockIdx.x*blockDim.x + threadIdx.x;
 j = blockIdx.y*blockDim.y + threadIdx.y;

 if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {

    char *d_u    = (char *) u.ptr;
    char *d_uold = (char *) uold.ptr;
    char *d_unew = (char *) unew.ptr;

    REAL *u_row = (REAL *)(d_u + j * u.pitch);
    REAL u_JP1  = ( (REAL *)(d_u + (j+1) * u.pitch) )[i];  /* (i,j+1) */
    REAL u_JM1  = ( (REAL *)(d_u + (j-1) * u.pitch) )[i];  /* (i,j-1) */
    REAL u_IP1  = u_row[i+1];                              /* (i+1,j) */
    REAL u_IM1  = u_row[i-1];                              /* (i-1,j) */
    REAL u_IC   = u_row[i];                                /* (i,j)   */

    REAL uold_IC  = ( (REAL *)(d_uold + j * uold.pitch) )[i];
    REAL *unew_row = (REAL *)(d_unew + j * unew.pitch);

    unew_row[i] = 2.0 * u_IC - uold_IC + FACTOR * ( u_IP1 + u_IM1 + u_JP1 + u_JM1 - 4.0 * u_IC );

 }

}                  

void initWave ( REAL *unew, REAL *u, REAL *uold, REAL *x, REAL *y )
{                    

    INT i,j;

    for (j=1; j<NY-1; j++) {
        for (i=1; i<NX-1; i++) {
            u[IC] =  0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
        }
    }
    for (j=1; j<NY-1; j++) {
        for (i=1; i<NX-1; i++) {
            uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
        }
    }
}

void meshGrid ( REAL *x, REAL *y )
{

    INT i,j;
    REAL a;

    for (j=0; j<NY; j++) {
        a = dx * ( (REAL) j );
        for (i=0; i<NX; i++) {
            x[IC] =  dx * ( (REAL) i );
            y[IC] = a;
        }
     }
}

INT main(INT argc, char *argv[])
{
    INT nTimeSteps = 100;
    REAL *unew, *u, *uold, *uFinal, *x, *y; //pointers for the host side

//  variable declaration for timing    
    cudaEvent_t timeStart, timeStop;
    cudaEventCreate(&timeStart);
    cudaEventCreate(&timeStop);
    float elapsedTime_gpu;

    unew           = (REAL *)calloc(NX*NY,sizeof(REAL));
    u              = (REAL *)calloc(NX*NY,sizeof(REAL));
    uold           = (REAL *)calloc(NX*NY,sizeof(REAL));
    uFinal         = (REAL *)calloc(NX*NY,sizeof(REAL));
    x              = (REAL *)calloc(NX*NY,sizeof(REAL));
    y              = (REAL *)calloc(NX*NY,sizeof(REAL));


    cudaExtent myExtent = make_cudaExtent(NX * sizeof(REAL), NY, NZ);
    cudaPitchedPtr d_u, d_uold, d_unew, d_tmp;

    // create device copies of the variables
    cudaMalloc3D( &d_u   , myExtent );   cudaCheckError();
    cudaMalloc3D( &d_uold, myExtent );   cudaCheckError();
    cudaMalloc3D( &d_unew, myExtent );   cudaCheckError();

    meshGrid( x, y );
    initWave( unew, u, uold, x, y );

    cudaMemcpy3DParms cpy3D = { 0 };
    cpy3D.extent = myExtent;
    cpy3D.kind   = cudaMemcpyHostToDevice;

    // copy 3D from u to d_u
    cpy3D.srcPtr = make_cudaPitchedPtr(u, NX*sizeof(REAL), NX, NY);
    cpy3D.dstPtr = d_u;
    cudaMemcpy3D( &cpy3D ); cudaCheckError();

    // copy 3D from uold to d_uold
    cpy3D.srcPtr = make_cudaPitchedPtr(uold, NX*sizeof(REAL), NX, NY);
    cpy3D.dstPtr = d_uold;
    cudaMemcpy3D( &cpy3D ); cudaCheckError();

    //  set up the GPU grid/block model
    dim3 dimGrid  ( N_BLOCKS_X , N_BLOCKS_Y  );
    dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );  

    //  launch the GPU kernel
    //  start timing the GPU
    cudaEventRecord(timeStart, 0);

    for (INT n=1; n<nTimeSteps+1; n++) {

       solveWaveGPU <<<dimGrid,dimBlock>>>(d_uold, d_u, d_unew);
       cudaDeviceSynchronize();
       cudaCheckError();

       d_tmp  = d_uold;
       d_uold = d_u;
       d_u    = d_unew;
       d_unew = d_tmp;

    }
    cudaEventRecord(timeStop, 0);
    cudaEventSynchronize(timeStop);
    cudaEventElapsedTime(&elapsedTime_gpu, timeStart, timeStop);


    // copy 3D from d_u to uFinal
    cpy3D.kind   = cudaMemcpyDeviceToHost;
    cpy3D.srcPtr = d_u;
    cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, NX*sizeof(REAL), NX, NY);
    cudaMemcpy3D( &cpy3D ); cudaCheckError();

    printf("elapsedTime on the GPU= %f s.\n", elapsedTime_gpu/1000.0);

    free(u);    cudaFree(d_unew.ptr);
    free(uold); cudaFree(d_u.ptr);
    free(unew); cudaFree(d_uold.ptr);
    free(uFinal); free(x); free(y);

    cudaEventDestroy(timeStart);
    cudaEventDestroy(timeStop);

    return (0);
}

Timing:

cudaMalloc3D: 1.192510 s
cudaMalloc:   0.960322 s

Machine specification:

GNU/Linux x86_64
NVIDIA GeForce GTX Titan CC: 3.5
CUDA ver 7.0

The performance difference you observe is mostly due to the increased instruction overhead of the pitched-memory indexing scheme. Because your array width is a large power of two, it is very likely that the pitch chosen by cudaMalloc3D equals the row width, so the pitched array is exactly the same size as the naïve allocation made with cudaMalloc. You may find that the performance difference between the two versions changes if you vary the problem size.
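
To see why no padding is expected here: a row of NX = 8192 doubles is 8192 * 8 = 65536 bytes, already a multiple of any plausible pitch-alignment granularity. A minimal stand-alone sketch (a hypothetical check program, not part of the code above) that prints the pitch your device actually returns:

#include <stdio.h>

int main(void)
{
    /* request the same extent used in the question: 8192 doubles x 4096 rows */
    cudaExtent e = make_cudaExtent(8192 * sizeof(double), 4096, 1);
    cudaPitchedPtr p;

    if (cudaMalloc3D(&p, e) != cudaSuccess) return 1;

    /* if pitch == width, no padding was added, and both versions allocate
       and traverse exactly the same amount of memory                      */
    printf("requested width = %zu bytes, returned pitch = %zu bytes\n",
           e.width, p.pitch);

    cudaFree(p.ptr);
    return 0;
}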

(Take note of the comments regarding compiler regressions in CUDA 7. If you refactor your code to pass the Fourier number as a kernel parameter, you will probably get a far bigger performance change than any difference due to pitched memory.)
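
A minimal sketch of that refactor for the cudaMalloc version, assuming the typedefs and the NX/NY/IC/IP1/IM1/JP1/JM1 macros from the listings above: FACTOR is evaluated once on the host and passed in as an argument, so the dt/dx macro arithmetic (including the sqrt hidden inside dt) is not re-expanded in device code.

__global__ void solveWaveGPU ( REAL *uold, REAL *u, REAL *unew, REAL factor )
{
 INT i = blockIdx.x*blockDim.x + threadIdx.x;
 INT j = blockIdx.y*blockDim.y + threadIdx.y;

 if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {

    unew[IC] = 2.0*u[IC] - uold[IC]
             + factor*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );

 }
}

/* at the launch site, evaluated once on the host: */
const REAL factor = FACTOR;
solveWaveGPU <<<dimGrid,dimBlock>>>(d_uold, d_u, d_unew, factor);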
