
Use of shared memory to reduce computational time of calculations inside CUDA kernel

I have an image of size 1920 x 1080. I am transferring it from host to device (H2D), processing it, and transferring it back (D2H) using three CUDA streams, where each stream handles one third of the data. I have optimized the block dimensions and the number of threads per block based on my understanding of SMs, SPs and warps. The code runs satisfactorily (taking 2 ms) when the kernel only performs simple calculations. The simple code below reads the R, G and B values from the source image and writes them back to the same locations in that image.

ptr_source[numChannels*  (iw*y + x) + 0] = ptr_source[numChannels*  (iw*y + x) + 0];
ptr_source[numChannels*  (iw*y + x) + 1] = ptr_source[numChannels*  (iw*y + x) + 1];
ptr_source[numChannels*  (iw*y + x) + 2] = ptr_source[numChannels*  (iw*y + x) + 2];
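
For reference, the three-stream pipeline described above looks roughly like the sketch below. The host buffer name, pinned allocation and launch configuration are placeholders rather than my exact code; the point is only that each stream copies, processes and copies back its own third of the image.

// Minimal sketch of the 3-stream H2D -> kernel -> D2H pipeline (placeholder names).
// h_image must be pinned (cudaHostAlloc / cudaHostRegister) for the async copies to overlap.
const int hostIW = 1080, hostIH = 1920, hostChannels = 3;  // host-side copies of the __constant__ values
const int nStreams = 3;
const size_t chunkBytes = (size_t)hostIW * (hostIH / nStreams) * hostChannels;

cudaStream_t streams[nStreams];
for (int i = 0; i < nStreams; ++i)
    cudaStreamCreate(&streams[i]);

dim3 block(16, 16);
dim3 grid((hostIW + block.x - 1) / block.x,
          ((hostIH / nStreams) + block.y - 1) / block.y);

for (int i = 0; i < nStreams; ++i) {
    const size_t offset = (size_t)i * chunkBytes;
    cudaMemcpyAsync(d_image + offset, h_image + offset, chunkBytes, cudaMemcpyHostToDevice, streams[i]);
    cudaKernel<<<grid, block, 0, streams[i]>>>(d_image + offset, nStreams);
    cudaMemcpyAsync(h_image + offset, d_image + offset, chunkBytes, cudaMemcpyDeviceToHost, streams[i]);
}

for (int i = 0; i < nStreams; ++i)
    cudaStreamSynchronize(streams[i]);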

However, I also have to perform a few more calculations, each independent of all other threads, and then the computational time increases by 6 ms, which is too much for my application. I have already declared the most frequently used constants in constant memory. The code for these calculations is shown below. I again read the R, G and B values, compute new R, G and B values by multiplying the old ones by some constants, and finally write the new values back to their corresponding positions in the same source image.

__constant__ int iw = 1080;
__constant__ int ih = 1920;
__constant__ int numChannels = 3;


__global__ void cudaKernel(unsigned char *ptr_source, int numCudaStreams)
{

    // Calculate our pixel's location
    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    int y = (blockIdx.y * blockDim.y) + threadIdx.y;

    // Operate only if we are in the correct boundaries
    if (x >= 0 && x < iw && y >= 0 && y < ih / numCudaStreams)
    {

        const int index_b = numChannels*  (iw*y + x) + 0;
        const int index_g = numChannels*  (iw*y + x) + 1;
        const int index_r = numChannels*  (iw*y + x) + 2;

        //GET VALUES: get the R,G and B values from Source image
        unsigned char b_val = ptr_source[index_b];
        unsigned char g_val = ptr_source[index_g];
        unsigned char r_val = ptr_source[index_r];

        float float_r_val = ((1.574090) * (float)r_val + (0.088825) * (float)g_val + (-0.1909)  * (float)b_val);
        float float_g_val = ((-0.344198) * (float)r_val + (1.579802) * (float)g_val + (-1.677604)  * (float)b_val);
        float float_b_val = ((-1.012951) * (float)r_val + (-1.781485) * (float)g_val + (2.404436)  * (float)b_val);


        unsigned char dst_r_val = (float_r_val > 255.0f) ? 255 : static_cast<unsigned char>(float_r_val);
        unsigned char dst_g_val = (float_g_val > 255.0f) ? 255 : static_cast<unsigned char>(float_g_val);
        unsigned char dst_b_val = (float_b_val > 255.0f) ? 255 : static_cast<unsigned char>(float_b_val);

        //PUT VALUES---put the new calculated values of R,G and B
        ptr_source[index_b] = dst_b_val;
        ptr_source[index_g] = dst_g_val;
        ptr_source[index_r] = dst_r_val;

    }
}

Problem: I think that transferring the image segment (i.e. ptr_source) to shared memory will help, but I am quite confused about how to do it. Since the scope of shared memory is a single block, how do I manage the transfer of the image segment into shared memory?
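For what it's worth, the usual pattern for staging a block's portion of an image into shared memory is sketched below (BLOCK_W and BLOCK_H are assumed to match the launch configuration; this illustrates the mechanics only, not a claim that it speeds up this particular kernel):

// Sketch: each block cooperatively stages its own tile into shared memory,
// synchronises, and then works on the on-chip copy.
#define BLOCK_W 16
#define BLOCK_H 16

__global__ void cudaKernelShared(unsigned char *ptr_source, int numCudaStreams)
{
    __shared__ unsigned char tile[BLOCK_H][BLOCK_W * 3];

    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    int y = (blockIdx.y * blockDim.y) + threadIdx.y;

    if (x < iw && y < ih / numCudaStreams)
    {
        const int gidx = numChannels * (iw * y + x);
        tile[threadIdx.y][3 * threadIdx.x + 0] = ptr_source[gidx + 0];
        tile[threadIdx.y][3 * threadIdx.x + 1] = ptr_source[gidx + 1];
        tile[threadIdx.y][3 * threadIdx.x + 2] = ptr_source[gidx + 2];
    }
    __syncthreads();

    // ... compute on tile[threadIdx.y][...] here and write the results back to ptr_source ...
}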

PS: My GPU is Quadro K2000, compute 3.0, 2 SM, 192 SP per SM.

I'm going to add this code without too much comment for the moment:

const int iw = 1080;
const int ih = 1920;
const int numChannels = 3;

__global__ void cudaKernel3(unsigned char *ptr_source, int n)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    uchar3 * p = reinterpret_cast<uchar3 *>(ptr_source);

    for(; idx < n; idx+=stride) {

        uchar3 vin = p[idx];
        unsigned char b_val = vin.x;
        unsigned char g_val = vin.y;
        unsigned char r_val = vin.z;

        float float_r_val = ((1.574090f) * (float)r_val + (0.088825f) * (float)g_val + (-0.1909f)  * (float)b_val);
        float float_g_val = ((-0.344198f) * (float)r_val + (1.579802f) * (float)g_val + (-1.677604f)  * (float)b_val);
        float float_b_val = ((-1.012951f) * (float)r_val + (-1.781485f) * (float)g_val + (2.404436f)  * (float)b_val);

        uchar3 vout;
        vout.x = (unsigned char)fminf(255.f, float_b_val);   // keep the original B,G,R channel order
        vout.y = (unsigned char)fminf(255.f, float_g_val);
        vout.z = (unsigned char)fminf(255.f, float_r_val);

        p[idx] = vout;
    }
}

// Original kernel with a bit of template magic to conditionally correct
// accidental double precision arithmetic removed for brevity

int main()
{
    const size_t sz = iw * ih * numChannels;
    typedef unsigned char uchar;
    uchar * image = new uchar[sz];

    uchar v = 0;
    for(int i=0; i<sz; i++) {
        image[i] = v;
        v = (v >= 128) ? 0 : v + 1;   // well-defined equivalent of the original increment-and-wrap
    }

    uchar * image_;
    cudaMalloc((void **)&image_, sz);
    cudaMemcpy(image_, image, sz, cudaMemcpyHostToDevice);

    dim3 blocksz(32,32);
    dim3 gridsz(1+iw/blocksz.x, 1+ih/blocksz.y);
    cudaKernel<1><<<gridsz, blocksz>>>(image_, 1);
    cudaDeviceSynchronize();

    cudaMemcpy(image_, image, sz, cudaMemcpyHostToDevice);
    cudaKernel<0><<<gridsz, blocksz>>>(image_, 1);
    cudaDeviceSynchronize();

    cudaMemcpy(image_, image, sz, cudaMemcpyHostToDevice);
    cudaKernel3<<<16, 512>>>(image_, iw * ih);
    cudaDeviceSynchronize();

    cudaDeviceReset();

    return 0;
}

The idea here is just to have as many threads as can be resident on the device and have them process the whole image, with each thread emitting multiple outputs. Block scheduling is very cheap in CUDA, but it isn't free, and neither are the indexing calculations and all the other "setup" code required for one thread to do useful work. So the idea is simply to amortise those costs over many outputs. Because your image is just linear memory and the operations you perform on each entry are completely independent, there is no point in using a 2D grid and 2D indexing; that is simply additional setup code which slows things down. You will also see the use of a vector type (uchar3), which should improve memory throughput by reducing the number of memory transactions per pixel.
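As a sketch of that sizing idea (not part of the original code above), the hard-coded <<<16, 512>>> launch could instead be derived from the device itself, so the grid holds roughly as many blocks as can be resident at once:

// Sketch: size the grid from the device so the grid-stride loop in cudaKernel3
// is covered by roughly the number of blocks that can be resident at one time.
int device = 0, numSMs = 0, blocksPerSM = 0;
const int blockSize = 512;

cudaGetDevice(&device);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, cudaKernel3, blockSize, 0);

cudaKernel3<<<numSMs * blocksPerSM, blockSize>>>(image_, iw * ih);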

Also note that on double precision capable GPUs, double precision constants will cause 64 bit floating point arithmetic to be generated. Depending on your GPU, there is a 2x to 12x performance penalty for double precision compared to single precision. When I compile the kernel you posted and look at the PTX which the CUDA 7 release compiler emits for the sm_30 architecture (the same as your GPU), I see this in the pixel computation code:

cvt.f64.f32     %fd1, %f4;
mul.f64         %fd2, %fd1, 0d3FF92F78FEEF5EC8;
ld.global.u8    %rs9, [%rd1+1];
cvt.rn.f32.u16  %f5, %rs9;
cvt.f64.f32     %fd3, %f5;
fma.rn.f64      %fd4, %fd3, 0d3FB6BD3C36113405, %fd2;
ld.global.u8    %rs10, [%rd1];
cvt.rn.f32.u16  %f6, %rs10;
cvt.f64.f32     %fd5, %f6;
fma.rn.f64      %fd6, %fd5, 0dBFC86F694467381D, %fd4;
cvt.rn.f32.f64  %f1, %fd6;
mul.f64         %fd7, %fd1, 0dBFD607570C564F98;
fma.rn.f64      %fd8, %fd3, 0d3FF946DE76427C7C, %fd7;
fma.rn.f64      %fd9, %fd5, 0dBFFAD7774ABA3876, %fd8;
cvt.rn.f32.f64  %f2, %fd9;
mul.f64         %fd10, %fd1, 0dBFF0350C1B97353B;
fma.rn.f64      %fd11, %fd3, 0dBFFC80F66A550870, %fd10;
fma.rn.f64      %fd12, %fd5, 0d40033C48F10A99B7, %fd11;
cvt.rn.f32.f64  %f3, %fd12;

Note that everything is promoted to 64 bit floating point, the multiplications are all done in 64 bit with the floating point constants in IEEE754 double format, and the results are then demoted back to 32 bit. This is a real performance cost and you should be careful to avoid it by properly defining your floating point constants as single precision.
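Concretely, the fix is just to make every literal single precision with an f suffix (or an explicit float cast), for example:

// Without the suffix the literal is a double and drags the whole expression to 64 bit:
float slow = 1.574090  * (float)r_val;   // promoted to and computed in double precision
// With the suffix the computation stays in 32 bit floating point:
float fast = 1.574090f * (float)r_val;   // single-precision multiply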

When run on a GT620M (a 2 SM Fermi mobile part, running on batteries), we get the following profile data from nvprof

Time(%)      Time     Calls       Avg       Min       Max  Name
 39.44%  17.213ms         1  17.213ms  17.213ms  17.213ms  void cudaKernel<int=1>(unsigned char*, int)
 35.02%  15.284ms         3  5.0947ms  5.0290ms  5.2022ms  [CUDA memcpy HtoD]
 18.51%  8.0770ms         1  8.0770ms  8.0770ms  8.0770ms  void cudaKernel<int=0>(unsigned char*, int)
  7.03%  3.0662ms         1  3.0662ms  3.0662ms  3.0662ms  cudaKernel3(unsigned char*, int)

==5504== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 95.37%  1.01433s         1  1.01433s  1.01433s  1.01433s  cudaMalloc
  3.17%  33.672ms         3  11.224ms  4.8036ms  19.039ms  cudaDeviceSynchronize
  1.29%  13.706ms         3  4.5687ms  4.5423ms  4.5924ms  cudaMemcpy
  0.12%  1.2560ms        83  15.132us     427ns  541.81us  cuDeviceGetAttribute
  0.03%  329.28us         3  109.76us  91.086us  139.41us  cudaLaunch
  0.02%  209.54us         1  209.54us  209.54us  209.54us  cuDeviceGetName
  0.00%  23.520us         1  23.520us  23.520us  23.520us  cuDeviceTotalMem
  0.00%  13.685us         3  4.5610us  2.9930us  7.6980us  cudaConfigureCall
  0.00%  9.4090us         6  1.5680us     428ns  3.4210us  cudaSetupArgument
  0.00%  5.1320us         2  2.5660us  2.5660us  2.5660us  cuDeviceGetCount
  0.00%  2.5660us         2  1.2830us  1.2830us  1.2830us  cuDeviceGet

and when run on something bigger (GTX 670 Kepler device with 7 SMX):

==9442== NVPROF is profiling process 9442, command: ./a.out
==9442== Profiling application: ./a.out
==9442== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 65.68%  2.6976ms         3  899.19us  784.56us  1.0829ms  [CUDA memcpy HtoD]
 20.84%  856.05us         1  856.05us  856.05us  856.05us  void cudaKernel<int=1>(unsigned char*, int)
  7.90%  324.64us         1  324.64us  324.64us  324.64us  void cudaKernel<int=0>(unsigned char*, int)
  5.58%  229.12us         1  229.12us  229.12us  229.12us  cudaKernel3(unsigned char*, int)

==9442== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 55.88%  45.443ms         1  45.443ms  45.443ms  45.443ms  cudaMalloc
 38.16%  31.038ms         1  31.038ms  31.038ms  31.038ms  cudaDeviceReset
  3.55%  2.8842ms         3  961.40us  812.99us  1.1982ms  cudaMemcpy
  1.92%  1.5652ms         3  521.72us  294.16us  882.27us  cudaDeviceSynchronize
  0.32%  262.49us        83  3.1620us     150ns  110.94us  cuDeviceGetAttribute
  0.09%  74.253us         3  24.751us  15.575us  41.784us  cudaLaunch
  0.03%  22.568us         1  22.568us  22.568us  22.568us  cuDeviceTotalMem
  0.03%  20.815us         1  20.815us  20.815us  20.815us  cuDeviceGetName
  0.01%  7.3900us         6  1.2310us     200ns  5.3890us  cudaSetupArgument
  0.00%  3.6510us         2  1.8250us     674ns  2.9770us  cuDeviceGetCount
  0.00%  3.1440us         3  1.0480us     516ns  1.9410us  cudaConfigureCall
  0.00%  2.1600us         2  1.0800us     985ns  1.1750us  cuDeviceGet

So there is a big speed-up to be had on both smaller and larger devices just by fixing elementary mistakes and using sensible design patterns. Believe it or not.

Shared memory won't help in your case; your memory accesses are not coalesced.

You can try the following: replacing your unsigned char *ptr_source with a uchar3 * should help your threads access contiguous data in the array. A uchar3 is simply 3 contiguous unsigned chars.

Since threads within the same warp execute the same instruction at the same time, you will have this kind of access pattern:

Suppose you try to access memory at address 0x3F0000.

thread 1 copies data at 0x3F0000, then 0x3F0001, then 0x3F0002

thread 2 copies data at 0x3F0003, then 0x3F0004, then 0x3F0005

0x3F0000 and 0x3F0003 are not contiguous within a single instruction, so you will get poor performance accessing your data this way.

With uchar3 accesses:

thread 1: 0x3F0000 to 0x3F0002

thread 2: 0x3F0003 to 0x3F0005

Since each thread now copies its pixel as one contiguous chunk, the memory controller can service the accesses quickly.

You can also replace:

(float_r_val > 255.0f) ? 255 : static_cast<unsigned char>(float_r_val);

with

float_r_val = fminf(255.0f, float_r_val);

This should give you a kernel like this:

__global__ void cudaKernel(uchar3 *ptr_source, int numCudaStreams)
{

    // Calculate our pixel's location
    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    int y = (blockIdx.y * blockDim.y) + threadIdx.y;

    // Operate only if we are in the correct boundaries
    if (x >= 0 && x < iw && y >= 0 && y < ih / numCudaStreams)
    {
        const int index =   (iw*y + x);

        // Pixel is stored as B, G, R (same layout as the original kernel)
        uchar3 val = ptr_source[index];

        float float_r_val = ((1.574090f) * (float)val.z + (0.088825f) * (float)val.y + (-0.1909f)  * (float)val.x);
        float float_g_val = ((-0.344198f) * (float)val.z + (1.579802f) * (float)val.y + (-1.677604f)  * (float)val.x);
        float float_b_val = ((-1.012951f) * (float)val.z + (-1.781485f) * (float)val.y + (2.404436f)  * (float)val.x);

        ptr_source[index] = make_uchar3( (unsigned char)fminf(255.0f, float_b_val),
                                         (unsigned char)fminf(255.0f, float_g_val),
                                         (unsigned char)fminf(255.0f, float_r_val) );
    }
}

I hope these updates improve performance.
