Segmentation fault on Cuda

Question

I'm writing a CUDA program to process images. I'm getting a segmentation fault and I have absolutely no idea why. I'm probably just missing some minor thing, but after hours of trying to fix it myself I couldn't get it running.
I'm setting the correct grid, block and shared memory values (at least I think so), according to deviceQuery on my hardware (GeForce 310M; total shared memory: 16384, max threads per block: 512, and max block dim: 521). Here is the output I get before the segmentation fault:

Cols and Rows:
256
384
total:98304 // rows*cols
Block Size:512
Grid Size:192
shared mem:2048

Below is the kernel code

/**
 * Per-block minimum reduction.
 *
 * Each block loads blockDim.x consecutive floats from d_logLuminance into
 * shared memory, reduces them to a single minimum with an interleaved tree
 * reduction, and thread 0 stores that minimum in minVar[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid and 1D block.
 *   - Dynamic shared memory of blockDim.x * sizeof(float) bytes.
 *   - gridDim.x * blockDim.x must not exceed the number of elements in
 *     d_logLuminance: the kernel takes no element count, so it cannot
 *     bounds-check the global load.
 *
 * @param minVar          Output: at least gridDim.x floats, one minimum per block.
 * @param d_logLuminance  Input values (device memory); only read by this kernel.
 */
__global__ void reduce_min(float *minVar,float* d_logLuminance)
{
    extern __shared__ float s_data[];   // one float per thread in the block
    const unsigned int tId = threadIdx.x;
    const unsigned int global_id = blockIdx.x*blockDim.x + tId;

    // Stage this block's slice of the input in shared memory.
    s_data[tId] = d_logLuminance[global_id];
    __syncthreads();

    for(unsigned int i = 1;i<blockDim.x;i*=2)
    {
        // The partner check keeps the read inside the shared array when
        // blockDim.x is not a power of two.
        if(tId%(2*i) == 0 && tId+i < blockDim.x)
        {
            s_data[tId] = fminf(s_data[tId],s_data[tId+i]);
        }
        // Reached by every thread in the block: the barrier is outside the branch.
        __syncthreads();
    }

    // Write to the output buffer, not back into the input: other blocks may
    // still be reading d_logLuminance[0 .. gridDim.x-1] at this point.
    if(tId == 0)
        minVar[blockIdx.x] = s_data[0];
}

And the part where I allocate memory and invoke kernel

// Launch configuration: 512 threads per block, one float of dynamic shared
// memory per thread.
size_t size = 512;
    // Integer division: if numRows*numCols is not a multiple of `size`, the
    // remainder elements are not covered by any block.
    size_t sizeOfgrid = (numRows*numCols)/size;
    const dim3 blockSize(size,1,1); 
    const dim3 gridSize(sizeOfgrid,1,1);
    unsigned int sharedMem =(sizeof(float)*size); //Shared memory per block

    printf("%s\n%d\n%d\n%s%d\n","Cols and Rows:",numCols,numRows,"total:",numRows*numCols);
    // NOTE(review): `size` and `sizeOfgrid` are size_t but are printed with %d;
    // the matching conversion for size_t is %zu.
    printf("%s%d\n%s%d\n%s%d\n","Block Size:",size,"Grid Size:",sizeOfgrid,"shared mem:",sharedMem);

    // d_lum: device-side working copy of the input, so d_logLuminance itself is
    // not handed to the kernel. outData: device buffer passed as the kernel's
    // first argument.
    // NOTE(review): no cudaFree for d_lum/outData is visible in this snippet.
    float* d_lum;
    float* outData;
    checkCudaErrors(cudaMalloc(&d_lum,sizeof(float)*numRows*numCols));
        checkCudaErrors(cudaMemcpy(d_lum,d_logLuminance,sizeof(float)*numRows*numCols,cudaMemcpyDeviceToDevice));   
    checkCudaErrors(cudaMalloc(&outData,sizeof(float)*numRows*numCols));

    reduce_min<<<gridSize,blockSize,sharedMem>>>(outData,d_lum);
    // NOTE(review): the return value of cudaDeviceSynchronize() is not checked here.
    cudaDeviceSynchronize();
    checkCudaErrors(cudaGetLastError());        
    // NOTE(review): this copies numRows*numCols floats to the address of
    // min_logLum. Its declaration is not visible in this snippet; if it is a
    // single float, the copy overruns host memory — confirm the destination size.
    checkCudaErrors(cudaMemcpy(&min_logLum,outData,sizeof(float)*numRows*numCols,cudaMemcpyDeviceToHost));

Thanks in advance for help.

Answer 1

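checkCudaErrors(cudaMemcpy(&min_logLum,outData,sizeof(float)*numRows*numCols,cudaMemcpyDeviceToHost)); This call was generating the segmentation fault, because I forgot to mention (and I missed it when I was writing the code) that min_logLum is a single float, not a float* buffer of size numRows*numCols. So in the end I was trying to copy much more memory than I had allocated for the destination. The number of bytes passed to cudaMemcpy must never exceed the size of the destination: either copy only sizeof(float) bytes into &min_logLum, or copy into a host buffer that is large enough for everything being transferred.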

Segmentation fault on Cuda

Question

1 answers

solution1
1 2013-07-31 14:18:11

Segmentation fault on Cuda

Question

1 answers

solution1 1 2013-07-31 14:18:11

solution1
1 2013-07-31 14:18:11