What is the significance of the 'sharedMemBytes' argument in the kernel launch call cuLaunchKernel()?

I am trying to implement a simple matrix multiplication program using shared memory in JCuda.

Following is my JCudaSharedMatrixMul.java code:

import static jcuda.driver.JCudaDriver.cuCtxCreate;
import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
import static jcuda.driver.JCudaDriver.cuDeviceGet;
import static jcuda.driver.JCudaDriver.cuInit;
import static jcuda.driver.JCudaDriver.cuLaunchKernel;
import static jcuda.driver.JCudaDriver.cuMemAlloc;
import static jcuda.driver.JCudaDriver.cuMemFree;
import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
import static jcuda.driver.JCudaDriver.cuModuleLoad;
import static jcuda.runtime.JCuda.cudaEventCreate;
import static jcuda.runtime.JCuda.cudaEventRecord;
import static jcuda.runtime.JCuda.*;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Scanner;

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.JCudaDriver;
import jcuda.runtime.cudaEvent_t;



public class JCudaSharedMatrixMul
{

    public static void main(String[] args) throws IOException 
    {
        // Enable exceptions and omit all subsequent error checks
        JCudaDriver.setExceptionsEnabled(true);

        // Create the PTX file by calling the NVCC
        String ptxFilename = preparePtxFile("JCudaSharedMatrixMulKernel.cu");

        //Initialize the driver and create a context for the first device.
        cuInit(0);
        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);

        //Load the PTX file
        CUmodule module = new CUmodule();
        cuModuleLoad(module, ptxFilename);

        //Obtain a function pointer to the matrix multiplication kernel
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "jCudaSharedMatrixMulKernel");

        int numRows = 16;
        int numCols = 16;

        //Allocate and fill Host input Matrices:
        float hostMatrixA[] = new float[numRows*numCols];
        float hostMatrixB[] = new float[numRows*numCols];
        float hostMatrixC[] = new float[numRows*numCols];


        for(int i = 0; i<numRows; i++)
        {
            for(int j = 0; j<numCols; j++)
            {
                hostMatrixA[i*numCols+j] = 1.0f;
                hostMatrixB[i*numCols+j] = 1.0f;
            }
        }
        // Allocate the device input data, and copy the
        // host input data to the device
        CUdeviceptr devMatrixA = new CUdeviceptr();
        cuMemAlloc(devMatrixA, numRows * numCols * Sizeof.FLOAT);

        //This is the part where it gives me the error
        cuMemcpyHtoD(devMatrixA, Pointer.to(hostMatrixA), numRows * numCols * Sizeof.FLOAT);

        CUdeviceptr devMatrixB = new CUdeviceptr();
        cuMemAlloc(devMatrixB, numRows * numCols * Sizeof.FLOAT);

        //This is the part where it gives me the error
        cuMemcpyHtoD(devMatrixB, Pointer.to(hostMatrixB ), numRows * numCols * Sizeof.FLOAT);

        //Allocate device matrix C to store output
        CUdeviceptr devMatrixC = new CUdeviceptr();
        cuMemAlloc(devMatrixC, numRows * numCols * Sizeof.FLOAT);

        // Set up the kernel parameters: A pointer to an array
        // of pointers which point to the actual values.

        Pointer kernelParameters = Pointer.to(
                                Pointer.to(new int[]{numCols}), 
                                Pointer.to(devMatrixA),
                                Pointer.to(devMatrixB),
                                Pointer.to(devMatrixC));

        //Kernel thread configuration
        int blockSize = 16;
        int gridSize = 1;

        cudaEvent_t start = new cudaEvent_t();
        cudaEvent_t stop = new cudaEvent_t();
        cudaEventCreate(start);
        cudaEventCreate(stop);
        long start_nano=System.nanoTime();
        cudaEventRecord(start, null);

        cuLaunchKernel(function, 
                       gridSize, 1, 1,
                       blockSize, 16, 1,
                       250, null, kernelParameters, null);

        cuCtxSynchronize();
        cudaEventRecord(stop, null);
        long end_nano=System.nanoTime();
        float elapsedTimeMsArray[] = { Float.NaN };
        cudaEventElapsedTime(elapsedTimeMsArray, start, stop);
        float elapsedTimeMs = elapsedTimeMsArray[0];
        System.out.println("Time Required (Using cudaevent elapsed time) = " + " " +elapsedTimeMs+
                "Time Required (Using nanotime)= "+(end_nano-start_nano)/1000000);
        // Allocate host output memory and copy the device output
        // to the host.

        //This is the part where it gives me the error
        cuMemcpyDtoH(Pointer.to(hostMatrixC), devMatrixC, numRows * numCols * Sizeof.FLOAT);



        //verify the result
        for (int i = 0; i < numRows; i++)
        {
            for (int j = 0; j < numCols; j++)
            {
                System.out.print("   "+ hostMatrixC[i*numCols+j]);
            }
            System.out.println("");
        }

        cuMemFree(devMatrixA);
        cuMemFree(devMatrixB);
        cuMemFree(devMatrixC);
    }

    private static String preparePtxFile(String cuFileName) throws IOException
    {
        int endIndex = cuFileName.lastIndexOf('.');
        if (endIndex == -1)
        {
            endIndex = cuFileName.length()-1;
        }
        String ptxFileName = cuFileName.substring(0, endIndex+1)+"ptx";
        File ptxFile = new File(ptxFileName);
        if (ptxFile.exists())
        {
            return ptxFileName;
        }

        File cuFile = new File(cuFileName);
        if (!cuFile.exists())
        {
            throw new IOException("Input file not found: "+cuFileName);
        }
        String modelString = "-m"+System.getProperty("sun.arch.data.model");
        String command = "nvcc " + modelString + " -ptx "+ cuFile.getPath()+" -o "+ptxFileName;

        System.out.println("Executing\n"+command);
        Process process = Runtime.getRuntime().exec(command);

        String errorMessage = new String(toByteArray(process.getErrorStream()));
        String outputMessage = new String(toByteArray(process.getInputStream()));
        int exitValue = 0;
        try
        {
            exitValue = process.waitFor();
        }
        catch (InterruptedException e)
        {
            Thread.currentThread().interrupt();
            throw new IOException(
                "Interrupted while waiting for nvcc output", e);
        }

        if (exitValue != 0)
        {
            System.out.println("nvcc process exitValue "+exitValue);
            System.out.println("errorMessage:\n"+errorMessage);
            System.out.println("outputMessage:\n"+outputMessage);
            throw new IOException(
                "Could not create .ptx file: "+errorMessage);
        }

        System.out.println("Finished creating PTX file");
        return ptxFileName;

    }

    private static byte[] toByteArray(InputStream inputStream) throws IOException
    {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte buffer[] = new byte[8192];
        while (true)
        {
            int read = inputStream.read(buffer);
            if (read == -1)
            {
                break;
            }
            baos.write(buffer, 0, read);
        }
        return baos.toByteArray();
    }


}

Following is my JCudaSharedMatrixMulKernel.cu code:

extern "C"
__global__ void jCudaSharedMatrixMulKernel(int N,float *ad,float *bd,float *cd)
{
    float pvalue=0;
    int TILE=blockDim.x;
    int ty=threadIdx.y;
    int tx=threadIdx.x;

    __shared__ float ads[4][4];
    __shared__ float bds[4][4];

    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;

    for(int i=0;i< N/TILE;++i)
    {
        ads[ty][tx] = ad[Row * N + (i * TILE) + tx];
        bds[ty][tx] = bd[(i * TILE + ty) * N + Col];

        __syncthreads();

        for(int k=0;k<TILE;k++)
                pvalue += ads[ty][k] * bds[k][tx];

        __syncthreads();  
    }

    cd[Row * N + Col] = pvalue;
}

In my example above, the total shared memory used per block is 2*4*4*4 = 128 bytes. When I set the sharedMemBytes parameter in cuLaunchKernel to 0 (zero), it gives me the following error:

Exception in thread "main" jcuda.CudaException: CUDA_ERROR_LAUNCH_FAILED
    at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:282)
    at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1795)
    at JCudaSharedMatrixMul.main(JCudaSharedMatrixMul.java:121)

When I define it as 128, it gives the same error, but when I make it 129, I get the correct output! In fact, any value from 129 to 49024 gives the correct result. My questions: why am I not able to get the correct output when I define it as 128? What is the maximum shared memory that can be defined? And why does the 129-49024 range work here?

You're launching blocks of 16x16 threads:

    cuLaunchKernel(function, 
                   gridSize, 1, 1,
                   blockSize, 16, 1,  <-- the first two params are block.x and block.y
                   250, null, kernelParameters, null);

so __shared__ float ads[4][4]; should not be working at all. For example, these lines of kernel code would be accessing those shared arrays out-of-bounds for some threads:

    ads[ty][tx] = ad[Row * N + (i * TILE) + tx];
    bds[ty][tx] = bd[(i * TILE + ty) * N + Col];
         ^   ^
         |   tx goes from 0..15 for a 16x16 threadblock
         ty goes from 0..15 for a 16x16 threadblock

Your code is broken in this respect. If you run it with cuda-memcheck, it may catch these out-of-bounds accesses, even in your "passing" case. Looking at the matrixMulDrv cuda sample code will be instructive: you'll see that its shared memory allocation is 2*block_size*block_size, as it should be for your case as well, so your shared memory definitions should be [16][16], not [4][4]. It may be that the shared memory allocation granularity just happens to make things work once you exceed 128 bytes, but there is a defect in your code.
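
For reference, cuda-memcheck can wrap the JVM invocation directly; a typical command (the classpath placeholder below stands in for wherever your class files and the JCuda jars live) would be:

    cuda-memcheck java -cp <your classpath> JCudaSharedMatrixMul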

Your shared definitions should be:

__shared__ float ads[16][16];
__shared__ float bds[16][16];

Since the above allocations are static, and the sharedMemBytes parameter specifies the dynamic shared memory allocation, for this example you don't need to allocate any dynamic shared memory (0 is OK) and it still works. The difference between static and dynamic shared memory is covered here.
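
For illustration, here is a minimal sketch (hypothetical kernels, not part of the code in the question) contrasting the two kinds: the static tile needs no sharedMemBytes at launch, while the extern array receives whatever size the host passes there:

extern "C"
__global__ void staticSharedExample(float *data)
{
    // Static allocation: size fixed at compile time; it does NOT count
    // toward the sharedMemBytes launch argument.
    __shared__ float tile[16][16];            // 16*16*4 = 1024 bytes
    int ty = threadIdx.y, tx = threadIdx.x;
    tile[ty][tx] = data[ty * 16 + tx];
    __syncthreads();
    data[ty * 16 + tx] = tile[tx][ty];        // e.g. transpose the tile
}

extern "C"
__global__ void dynamicSharedExample(float *data, int tileDim)
{
    // Dynamic allocation: the size is whatever the host passes as
    // sharedMemBytes in cuLaunchKernel, e.g. tileDim*tileDim*sizeof(float).
    extern __shared__ float dynTile[];
    int ty = threadIdx.y, tx = threadIdx.x;
    dynTile[ty * tileDim + tx] = data[ty * tileDim + tx];
    __syncthreads();
    data[ty * tileDim + tx] = dynTile[tx * tileDim + ty];
}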

The maximum shared memory per block is available in the documentation, or by running the cuda deviceQuery sample code. It is 48KB (49152 bytes) for cc2.0 and newer devices. That also explains the top of your working range: the 128 bytes of statically declared shared memory and the dynamic sharedMemBytes allocation share the same per-block limit, and 49152 - 128 = 49024, exactly the largest value you observed to work.
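
If you'd rather query the limit programmatically than read the deviceQuery output, a minimal sketch using the driver API (the same API family your JCuda code wraps) would be:

#include <stdio.h>
#include <cuda.h>

int main(void)
{
    // Query the per-block shared memory limit for device 0.
    cuInit(0);
    CUdevice dev;
    cuDeviceGet(&dev, 0);

    int smemPerBlock = 0;
    cuDeviceGetAttribute(&smemPerBlock,
        CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);

    printf("Max shared memory per block: %d bytes\n", smemPerBlock);
    return 0;
}

Compile and link against the driver library (e.g. with -lcuda). JCuda exposes the same query as cuDeviceGetAttribute with CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK.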
