Optimizing an OpenCL kernel

I am trying to optimize this kernel. The CPU version is 4 times faster than the GPU version, while I would expect the GPU version to be faster. It might be that the kernel performs a lot of memory accesses and that is why performance is low. I am using an Intel HD 2500 and OpenCL 1.2.

The GPU kernel is:

__kernel void mykernel(__global unsigned char *inp1,    
                        __global  unsigned char *inp2,      
                        __global  unsigned char *inp3,          
                        __global  unsigned char *inp4,          
                        __global  unsigned char *outp1,     
                        __global  unsigned char *outp2,     
                        __global  unsigned char *outp3,     
                        __global  unsigned char *outp4,     
                        __global  unsigned char *lut,           
                        uint size
                        )               
{
  unsigned char x1, x2, x3, x4;
  unsigned char y1, y2, y3, y4;
   const int x     = get_global_id(0);                      
   const int y     = get_global_id(1);                          
   const int width = get_global_size(0);                        
   const uint id = y * width + x;                               
    x1 = inp1[id];
    x2 = inp2[id];
    x3 = inp3[id];
    x4 = inp4[id];
    // gather the i-th 2-bit field of each input byte into y_i
    // (a 4x4 transpose of 2-bit fields; the output stage below reverses it)
    y1 = (x1 & 0xc0) | (x2>>2 & 0x30) | (x3>>4 & 0x0c) | (x4>>6 & 0x03);
    y2 = (x1<<2 & 0xc0) | (x2 & 0x30) | (x3>>2 & 0x0c) | (x4>>4 & 0x03);
    y3 = (x1<<4 & 0xc0) | (x2<<2 & 0x30) | (x3 & 0x0c) | (x4>>2 & 0x03);
    y4 = (x1<<6 & 0xc0) | (x2<<4 & 0x30) | (x3<<2 & 0x0c) | (x4 & 0x03);
    // lookup table
    y1 = lut[y1];
    y2 = lut[y2];
    y3 = lut[y3];
    y4 = lut[y4];
    // scatter back: outp_i collects the i-th 2-bit field of y1..y4
    outp1[id] =    (y1 & 0xc0)
                 | ((y2 & 0xc0) >> 2)
                 | ((y3 & 0xc0) >> 4)
                 | ((y4 & 0xc0) >> 6);        
    outp2[id] =   ((y1 & 0x30) << 2)
                 |  (y2 & 0x30)
                 | ((y3 & 0x30) >> 2)
                 | ((y4 & 0x30) >> 4);             
    outp3[id] =   ((y1 & 0x0c) << 4)
                 | ((y2 & 0x0c) << 2)
                 |  (y3 & 0x0c)
                 | ((y4 & 0x0c) >> 2);            
    outp4[id] =   ((y1 & 0x03) << 6)
                 | ((y2 & 0x03) << 4)
                 | ((y3 & 0x03) << 2)
                 |  (y4 & 0x03);
}

I use:

   size_t localWorkSize[1], globalWorkSize[1];
   localWorkSize[0] = 1;
   globalWorkSize[0] = X*Y; // X,Y define a data space of 15 - 20 MB

localWorkSize can vary between 1 and 256.

For localWorkSize = 1 I get:
CPU = 0.067 s
GPU = 0.20 s
For localWorkSize = 256 I get:
CPU = 0.067 s
GPU = 0.34 s

This is really odd. Can you give me some ideas why I get these strange numbers, and do you have any tips on how I can optimize this kernel?

My main looks like this:

int main(int argc, char** argv)
{
   int err, err1, j, i;                // error codes returned from API calls, loop counters
   clock_t start, end;                 // measuring performance variables
   cl_device_id device_id;             // compute device id 
   cl_context context;                 // compute context
   cl_command_queue commands;          // compute command queue
   cl_program program_ms_naive;       // compute program
   cl_kernel kernel_ms_naive;         // compute kernel
   // ... dynamically allocate arrays
   // ... initialize arrays
   cl_uint dev_cnt = 0;
   clGetPlatformIDs(0, NULL, &dev_cnt);   // query the number of available platforms

   cl_platform_id platform_ids[100];
   clGetPlatformIDs(dev_cnt, platform_ids, NULL);
   // Connect to a compute device
   err = clGetDeviceIDs(platform_ids[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    // Create a compute context 
   context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
   // Create a command queue
   commands = clCreateCommandQueue(context, device_id, 0, &err);
   // Create the compute programs from the source file
   program_ms_naive = clCreateProgramWithSource(context, 1, (const char **) &kernelSource_ms, NULL, &err);
    // Build the programs executable
   err = clBuildProgram(program_ms_naive, 0, NULL, NULL, NULL, NULL);
    // Create the compute kernel in the program we wish to run
   kernel_ms_naive = clCreateKernel(program_ms_naive, "mykernel", &err);

   d_A1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A1, &err);
   d_A2 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A2, &err);
   d_A3 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A3, &err);
   d_A4 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A4, &err);
   d_lut = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 256, h_ltable, &err);
   d_B1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
   d_B2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
   d_B3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
   d_B4 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);

   int size = YCOLUMNS*XROWS/4; 
   int size_b = size * 4;
   err = clSetKernelArg(kernel_ms_naive,  0, sizeof(cl_mem), (void *)&(d_A1));
   err |= clSetKernelArg(kernel_ms_naive, 1, sizeof(cl_mem), (void *)&(d_A2));
   err |= clSetKernelArg(kernel_ms_naive, 2, sizeof(cl_mem), (void *)&(d_A3));
   err |= clSetKernelArg(kernel_ms_naive, 3, sizeof(cl_mem), (void *)&(d_A4));
   err |= clSetKernelArg(kernel_ms_naive, 4, sizeof(cl_mem), (void *)&d_B1);
   err |= clSetKernelArg(kernel_ms_naive, 5, sizeof(cl_mem), (void *)&(d_B2));
   err |= clSetKernelArg(kernel_ms_naive, 6, sizeof(cl_mem), (void *)&(d_B3));
   err |= clSetKernelArg(kernel_ms_naive, 7, sizeof(cl_mem), (void *)&(d_B4));
   err |= clSetKernelArg(kernel_ms_naive, 8, sizeof(cl_mem), (void *)&d_lut); //__global
   err |= clSetKernelArg(kernel_ms_naive, 9, sizeof(cl_uint), (void *)&size_b);
   size_t localWorkSize[1], globalWorkSize[1];
   localWorkSize[0] = 256;
   globalWorkSize[0] = XROWS*YCOLUMNS;
   start = clock(); 
   for (i=0;i< EXECUTION_TIMES;i++)
   {
       err1 = clEnqueueNDRangeKernel(commands, kernel_ms_naive, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
       err = clFinish(commands);
    }
   end = clock();

   return 0;
}

Constant memory is used to broadcast a small amount of data to all work items, and it behaves much like a constant private register, so access is very fast. GPU devices typically provide at least 16 kB of constant memory, which is more than enough to hold the 256-byte LUT.

You can try constant memory as a simple solution for the global memory access bottleneck:

__kernel void mykernel(const __global unsigned char *inp1,    
                        const __global  unsigned char *inp2,      
                        const __global  unsigned char *inp3,          
                        const __global  unsigned char *inp4,          
                        __global  unsigned char *outp1,     
                        __global  unsigned char *outp2,     
                        __global  unsigned char *outp3,     
                        __global  unsigned char *outp4,     
                        __constant unsigned char *lut,           
                        uint size
                        )               
{
  ...
}
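
Note that nothing needs to change on the host side for this: a __constant kernel argument is set with an ordinary buffer object, so the existing clCreateBuffer/clSetKernelArg calls for d_lut still work; only the qualifier in the kernel source differs.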

But a proper solution would be to reshape your code:

  • Use a single buffer of char4 vectors instead of 4 separate buffers, because the split layout breaks coalescing (see the sketch below) [it can give you a big boost, up to 4x]
  • Operate on vectors [slight boost]
  • Use local/constant memory for the LUT [it removes one non-coalesced read per lookup, maybe 2x-3x]

Still, it will be difficult to beat the CPU approach, due to the big I/O constraints.
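
As a rough sketch of the first two points (the packed layout and the name mykernel_vec are assumptions, not your actual data format): if the host interleaves inp1..inp4 into a single uchar4 buffer and accepts a single uchar4 output buffer, every work item does one coalesced 4-byte load and one coalesced 4-byte store instead of eight scattered single-byte accesses:

// Hypothetical vectorized variant: assumes the host packs {inp1[i], inp2[i],
// inp3[i], inp4[i]} into in[i] and unpacks out[i] back into the four outputs.
__kernel void mykernel_vec(const __global uchar4 *in,
                           __global uchar4 *out,
                           __constant unsigned char *lut,
                           uint size)                 // number of uchar4 elements
{
    const uint id = get_global_id(0);
    if (id >= size)                                   // guard against padding
        return;

    const uchar4 x = in[id];                          // one coalesced 4-byte read

    // same 2-bit-field transpose as the scalar kernel, per output byte
    uchar4 y;
    y.s0 = (x.s0 & 0xc0) | (x.s1>>2 & 0x30) | (x.s2>>4 & 0x0c) | (x.s3>>6 & 0x03);
    y.s1 = (x.s0<<2 & 0xc0) | (x.s1 & 0x30) | (x.s2>>2 & 0x0c) | (x.s3>>4 & 0x03);
    y.s2 = (x.s0<<4 & 0xc0) | (x.s1<<2 & 0x30) | (x.s2 & 0x0c) | (x.s3>>2 & 0x03);
    y.s3 = (x.s0<<6 & 0xc0) | (x.s1<<4 & 0x30) | (x.s2<<2 & 0x0c) | (x.s3 & 0x03);

    // LUT lookups stay scalar; with __constant they are broadcast-friendly
    y = (uchar4)(lut[y.s0], lut[y.s1], lut[y.s2], lut[y.s3]);

    // transpose back, mirroring the outp1..outp4 stage of the scalar kernel
    uchar4 z;
    z.s0 = (y.s0 & 0xc0) | (y.s1 & 0xc0)>>2 | (y.s2 & 0xc0)>>4 | (y.s3 & 0xc0)>>6;
    z.s1 = (y.s0 & 0x30)<<2 | (y.s1 & 0x30) | (y.s2 & 0x30)>>2 | (y.s3 & 0x30)>>4;
    z.s2 = (y.s0 & 0x0c)<<4 | (y.s1 & 0x0c)<<2 | (y.s2 & 0x0c) | (y.s3 & 0x0c)>>2;
    z.s3 = (y.s0 & 0x03)<<6 | (y.s1 & 0x03)<<4 | (y.s2 & 0x03)<<2 | (y.s3 & 0x03);

    out[id] = z;                                      // one coalesced 4-byte write
}

Whether this pays off depends on whether the rest of your pipeline can produce and consume the interleaved layout directly; if you have to pack and unpack on the CPU for every frame, that extra copy eats into the gain.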
