
Dereferencing a pointer in CUDA C

I am a novice C programmer and am a bit confused by this segmentation fault. I have worked with pointers before, and this doesn't make sense. The code runs on an NVIDIA GPU, but I am not using any of the CUDA API functions yet (I commented them out to isolate the error).

I get the error when dereferencing the pointer *mu on the GPU (see the code below) in the function calibrate; the error is a segmentation fault.

My host code is:

/******************************************************************************
 *cr
 *cr
 ******************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include "kernel.cu"
#include "support.h"

int main (int argc, char *argv[])
{

    Timer timer;
    cudaError_t cuda_ret;

    // Initialize host variables ----------------------------------------------

    printf("\nSetting up the problem...\n"); fflush(stdout);
    startTime(&timer);

    double* A_h, *T_h, *Delta_h, *E_h, *p_h, *p2_h, *D_h, *Times_h, *ones_h; 
    double* A_d, *T_d, *Delta_d, *E_d, *p_d, *p2_d, *D_d, *Times_d, *ones_d, *temp_1, *temp_2; 
    double* mu_h, *alpha_h, *omega_h;
    double* mu_d, *alpha_d, *omega_d;
    int N;
    unsigned int mat_size, vec_size;

    // Import data
    FILE *fp;
    char str[60];   
    unsigned int count=0;
    double d;

    /* opening file for reading */
    fp = fopen("AAPL_data.txt","r");

    if(fp == NULL) {
      perror("Error opening file");
      return(-1);
    }
    while(fgets (str, 60, fp)!=NULL)
        ++count;    

    // Stick with a limited subset of the data for now
    N = 2000;

    fclose(fp); 
    printf("Count is %u \n",count);     

    mat_size = N*N;
    vec_size = N;

    dim3 dim_grid, dim_block;

    // Fill matrices with 0's
    A_h = (double*) malloc( sizeof(double)*mat_size );
    for (unsigned int i=0; i < mat_size; ++i) { A_h[i] = 0; }

    T_h = (double*) malloc( sizeof(double)*mat_size );
    for (unsigned int i=0; i < mat_size; ++i) { T_h[i] = 0; }

    Delta_h = (double*) malloc( sizeof(double)*mat_size );
    for (unsigned int i=0; i < mat_size; ++i) { Delta_h[i] = 0; }

    E_h = (double*) malloc( sizeof(double)*mat_size );
    for (unsigned int i=0; i < mat_size; ++i) { E_h[i] = 0; }

    p_h = (double*) malloc( sizeof(double)*mat_size );
    for (unsigned int i=0; i < mat_size; ++i) { p_h[i] = 0; }

    // Fill vectors with 0's, except the 1's vector
    p2_h = (double*) malloc( sizeof(double)*vec_size );
    for (unsigned int i=0; i < vec_size; ++i) { p2_h[i] = 0; }

    Times_h = (double*) malloc( sizeof(double)*vec_size );
    for (unsigned int i=0; i < vec_size; ++i) { Times_h[i] = 0; }

    D_h = (double*) malloc( sizeof(double)*vec_size );
    for (unsigned int i=0; i < vec_size; ++i) { D_h[i] = 0; }

    ones_h = (double*) malloc( sizeof(double)*vec_size );
    for (unsigned int i=0; i < vec_size; ++i) { ones_h[i] = 0; }

    // Start constants as zero
    mu_h    = (double*) malloc( sizeof(double));
    alpha_h = (double*) malloc( sizeof(double));
    omega_h = (double*) malloc( sizeof(double));
    *mu_h = 0;
    *alpha_h = 0;
    *omega_h = 0;

    // Import data
    count=0;

    /* opening file for reading */
    fp = fopen("AAPL_data.txt","r");

    if(fp == NULL) {
      perror("Error opening file");
      return(-1);
    }       
    while(fgets (str, 60, fp)!=NULL)
    {
        sscanf(str, "%lf", &d);
        if(count < vec_size)
            Times_h[count] = d;
        ++count;
    }       
    fclose(fp); 


    /*printf("TIMES VECTOR: \n");   
    for (unsigned int i=0; i < vec_size; ++i) 
    { 
        printf("TIMES_H[ %u ] is ",i);
        printf("%f \n", Times_h[i]);
    }*/

    printf("Count is %u \n",count);     
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Allocate device variables ----------------------------------------------

    printf("Allocating device variables..."); fflush(stdout);
    startTime(&timer);

    cudaMalloc((void**) &A_d, mat_size*sizeof(double));                     // Create device variable for matrix A  
    cudaMalloc((void**) &T_d, mat_size*sizeof(double));                     // Create device variable for matrix T  
    cudaMalloc((void**) &Delta_d, mat_size*sizeof(double));                 // Create device variable for matrix Delta
    cudaMalloc((void**) &E_d, mat_size*sizeof(double));                     // Create device variable for matrix E
    cudaMalloc((void**) &p_d, mat_size*sizeof(double));                     // Create device variable for matrix p
    cudaMalloc((void**) &p2_d, vec_size*sizeof(double));                    // Create device variable for vector p2
    cudaMalloc((void**) &D_d, vec_size*sizeof(double));                     // Create device variable for vector D
    cudaMalloc((void**) &Times_d, vec_size*sizeof(double));                 // Create device variable for vector Times
    cudaMalloc((void**) &ones_d, vec_size*sizeof(double));                  // Create device variable for vector ones
    cudaMalloc((void**) &mu_d, sizeof(double));                             // Create device variable for constant mu
    cudaMalloc((void**) &alpha_d, sizeof(double));                          // Create device variable for constant alpha
    cudaMalloc((void**) &omega_d, sizeof(double));                          // Create device variable for constant omega
    cudaMalloc((void**) &temp_1, vec_size*sizeof(double));                  // Create device variable for temporary vector temp_1
    cudaMalloc((void**) &temp_2, mat_size*sizeof(double));                  // Create device variable for temporary matrix temp_2

    cudaDeviceSynchronize();
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Copy host variables to device ------------------------------------------

    printf("Copying data from host to device..."); fflush(stdout);
    startTime(&timer);

    cudaMemcpy(A_d,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice);            // Copy from host var to device var
    cudaMemcpy(T_d,T_h,mat_size*sizeof(double), cudaMemcpyHostToDevice);            // Copy from host var to device var
    cudaMemcpy(Delta_d,Delta_h,mat_size*sizeof(double), cudaMemcpyHostToDevice);    // Copy from host var to device var
    cudaMemcpy(E_d,E_h,mat_size*sizeof(double), cudaMemcpyHostToDevice);            // Copy from host var to device var
    cudaMemcpy(p_d,p_h,mat_size*sizeof(double), cudaMemcpyHostToDevice);            // Copy from host var to device var
    cudaMemcpy(p2_d,p2_h,vec_size*sizeof(double), cudaMemcpyHostToDevice);          // Copy from host var to device var
    cudaMemcpy(D_d,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice);            // Copy from host var to device var
    cudaMemcpy(ones_d,ones_h,vec_size*sizeof(double), cudaMemcpyHostToDevice);      // Copy from host var to device var
    cudaMemcpy(Times_d,Times_h,mat_size*sizeof(double), cudaMemcpyHostToDevice);    // Copy from host var to device var
    cudaMemcpy(mu_d,mu_h,sizeof(double), cudaMemcpyHostToDevice);                   // Copy from host var to device var
    cudaMemcpy(alpha_d,alpha_h,sizeof(double), cudaMemcpyHostToDevice);             // Copy from host var to device var
    cudaMemcpy(omega_d,omega_h,sizeof(double), cudaMemcpyHostToDevice);             // Copy from host var to device var

    cudaMemcpy(temp_1,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice);         // Copy from host var to device var
    cudaMemcpy(temp_2,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice);         // Copy from host var to device var


    cudaDeviceSynchronize();
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Launch kernel using standard sgemm interface ---------------------------
    printf("Launching kernel..."); fflush(stdout);
    startTime(&timer);

    int MAX_ITER = 100;
    double TOL = .001;

    calibrate(vec_size,mu_d, alpha_d, omega_d, A_d, T_d, Delta_d, E_d, p_d, p2_d, D_d, ones_d, Times_d, 
        MAX_ITER, TOL, temp_1, temp_2);


    //tiledSgemm('N', 'N', matArow, matBcol, matBrow, 1.0f, \
    //  A_d, matArow, B_d, matBrow, 0.0f, C_d, matBrow); // A1_d, B1_d);

    cuda_ret = cudaDeviceSynchronize();
    if(cuda_ret != cudaSuccess) FATAL("Unable to launch kernel");
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Copy device variables from host ----------------------------------------

    printf("Copying data from device to host...\n"); fflush(stdout);
    startTime(&timer);


    cudaMemcpy(mu_h,mu_d,sizeof(float), cudaMemcpyDeviceToHost);        // Copy from device var to host var
    cudaMemcpy(alpha_h,alpha_d,sizeof(float), cudaMemcpyDeviceToHost);  // Copy from device var to host var
    cudaMemcpy(omega_h,omega_d,sizeof(float), cudaMemcpyDeviceToHost);  // Copy from device var to host var

    printf("mu is %f: \n",mu_h);
    printf("alpha is %f: \n",alpha_h);
    printf("omega is %f: \n",omega_h);

    cudaDeviceSynchronize();
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));


    // Free memory ------------------------------------------------------------

    free(A_h);
    free(T_h);
    free(Delta_h);
    free(E_h);
    free(p_h);
    free(p2_h);
    free(D_h);
    free(ones_h);
    free(Times_h);
    free(mu_h);
    free(alpha_h);
    free(omega_h);

    cudaFree(A_d);
    cudaFree(T_d);
    cudaFree(Delta_d);
    cudaFree(E_d);
    cudaFree(p_d);
    cudaFree(p2_d);
    cudaFree(D_d);
    cudaFree(ones_d);
    cudaFree(Times_d);
    cudaFree(mu_d);
    cudaFree(alpha_d);
    cudaFree(omega_d);

    return 0;
}

The Kernel code on the GPU is:

/*****************************************************************************************/
#include <stdio.h>

#define TILE_SIZE 16
#define BLOCK_SIZE 512

__global__ void mysgemm(int m, int n, int k, const double *A, const double *B, double* C) {

    __shared__ float ds_A[TILE_SIZE][TILE_SIZE];
    __shared__ float ds_B[TILE_SIZE][TILE_SIZE];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = (by*TILE_SIZE+ty);//%m;
    int col = (bx*TILE_SIZE+tx);//%n;
    float pvalue = 0;


    for(int i=0;i<(k-1)/TILE_SIZE+1;++i)
    {
        if((i*TILE_SIZE +tx < k) && (row < m))
            ds_A[ty][tx] = A[row*k+i*TILE_SIZE+tx];
        else ds_A[ty][tx] = 0;

        if((i*TILE_SIZE+ty < k) && (col < n)) 
            ds_B[ty][tx] = B[(i*TILE_SIZE+ty)*n+col];       // Load data into shared memory
        else ds_B[ty][tx] = 0;

        __syncthreads();

        if(row < m && col < n)
        {
            for(int j=0;j<TILE_SIZE;++j)
            {
                //if(j < k)
                    pvalue += ds_A[ty][j]*ds_B[j][tx];
            }
        }
        __syncthreads();
    }

    if(row < m && col < n)
        C[row*n+col] = pvalue;
}

// Kernel to multiply each element in A by the corresponding element in B and store 
// the result to the corresponding element in C. All vectors should be of length m
__global__ void elem_mul(int m, const double *A, const double *B, double* C) 
{
    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int i = tx+bx*blockDim.x; 
    if(i < m)
        C[i] = A[i]*B[i];
}

// Kernel for parallel sum
__global__ void reduction(double *out, double *in, unsigned size)
{
    __shared__ float partialSum[2*BLOCK_SIZE];
    unsigned int t = threadIdx.x;
    unsigned int start = 2*blockIdx.x*blockDim.x;

    if(start + t >= size)
        partialSum[t] = 0;
    else partialSum[t] = in[start+t];

    if(start + blockDim.x+t>= size)
        partialSum[blockDim.x+t] = 0;
    else partialSum[blockDim.x+t] = in[start + blockDim.x+t];

    for(unsigned int stride = 1; stride <=blockDim.x; stride*=2)
    {
        __syncthreads();
        if(t % stride ==0)
            partialSum[2*t]+=partialSum[2*t+stride];
    }

    __syncthreads();

    out[blockIdx.x] = partialSum[0];
}

// Uses several kernels to compute the inner product of A and B
void inner_product(double *out, int m, const double *A, const double* B, double* temp)
{
    dim3    dimGrid((m-1)/BLOCK_SIZE+1,(m-1)/BLOCK_SIZE+1,1);
    dim3    dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
    elem_mul<<<dimGrid,dimBlock>>>(m,A,B,temp);
    reduction<<<dimGrid,dimBlock>>>(out,temp,m);        
}

// Kernel to multiply each element in the matrix out in the following manner:
// out(i,j) = in(i) - in(j)
__global__ void fill(int m, const double *in, double *out) 
{
    int bx = blockIdx.x;
    int by = blockIdx.y;    
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int i = tx+bx*blockDim.x; 
    int j = ty+by*blockDim.y; 

    if((i < m) && (j < m))
        out[i*m+j] = in[i]-in[j];
}

// Kernel to fill the matrix out with the formula out(i,j) = exp(-omega*T(i.j))
__global__ void fill_E(int m, double coeff, double *in, double *out) 
{
    int bx = blockIdx.x;
    int tx = threadIdx.x;       
    int i = tx+bx*blockDim.x; 

    if(i < m)
        out[i] = exp(-coeff * in[i]);
}

// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_mul(int m, int k, double coeff, double *in, double *out) 
{
    int bx = blockIdx.x;
    int tx = threadIdx.x;       
    int i = tx+bx*blockDim.x; 

    if(i < m*k)
        out[i] = coeff * in[i];
}

// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_add(int m, int k, double coeff, double *in, double *out) 
{
    int bx = blockIdx.x;
    int tx = threadIdx.x;       
    int i = tx+bx*blockDim.x; 

    if(i < m*k)
        out[i] = coeff + in[i];
}

// Kernel to update vector p2
__global__ void update_p2(int m, double coeff, double *in, double *out) 
{
    int bx = blockIdx.x;
    int tx = threadIdx.x;       
    int i = tx+bx*blockDim.x; 

    if(i < m)
        out[i] = coeff/in[i];
}

// Kernel to update matrix p
__global__ void update_p(int m, double* p2, double *denom, double *num, double *out) 
{
    int bx = blockIdx.x;
    int tx = threadIdx.x;       
    int i = tx+bx*blockDim.x; 

    // loop through columns j
    for(int j=0; j<m; ++j)
    {
        if(i == j)
            out[i*m + j] = p2[i];
        else if(i < m)
            out[i*m + j] = num[i*m+j]/denom[i];
    }
}


/*****************************************************************************************/
// int size:  length of the Time-series vectors. Also the number of rows and columns in input matrices
// double mu:       One of three parameters calibrated
// double alpha:    One of three parameters calibrated
// double omega:    One of three parameters calibrated
// double* A:       A matrix filled out and used to calibrate
// double* T:       A distance matrix T(i,j) = Times[i]-Times[j]
// double* Delta:   A dissimilarity matrix Delta(i,j) = 1 if i > j, 0 otherwise
// double* E:       A matrix filled out and used to calibrate--E(i,j) = exp(-omega*T(i,j))
// double* p:       A probability matrix of cross excitations
// double* p2:      A vector of self-excitation probabilities
// double* ones:    A (size x 1) vector of 1's used in inner products and identity transformations
// double* Times:   A (size x 1) vector of time series data to be calibrated
// int MAX_ITER:    The maximum number of iterations allowed in the calibration
// double* TOL:     The error tolerance or accuracy allowed in the calibration
// double* temp_1:  A (size x 1) temporary vector used in intermediate calculations 
// double* temp_2:  A temporary matrix used in intermediate calculations
/*****************************************************************************************/
void calibrate(int size, double *mu, double *alpha, double *omega, double *A, double *T, double *Delta, double *E, double *p, double *p2, double *D, double* ones, double *Times, int MAX_ITER, double TOL, double* temp_1, double* temp_2)
{   

    //1) (a) Perform inner product to start initial values of mu, alpha, and omega
    *mu = .11; // ERROR IS HERE!!
    /*
    inner_product(mu, size, Times, ones, temp_1);

    double a = *(mu);
    a = a/size;
    *mu = .11;

    /*  
    /size;
    *alpha =  *mu;
    *omega =  *mu;


    double mu_t = 0;
    double alpha_t = 0;
    double omega_t = 0;
    double err = 0;
    int ctr = 0;

    //1) (b) Fill out matrix T of time differences
    dim3    dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
    dim3    dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
    fill<<<dimGrid,dimBlock>>>(size, Times, T); 


    while(ctr < MAX_ITER && err < TOL)
    {
        // 2) Fill out matrix E
        dim3    dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
        dim3    dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
        fill_E<<<dimGrid,dimBlock>>>(size, omega, T, E);

        // 3) Update matrix A
        dim3    dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
        dim3    dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
        scal_mult<<<dimGrid,dimBlock>>>(size,size, alpha, delta, A);
        scal_mult<<<dimGrid,dimBlock>>>(size,size, omega, A, A);

        dim3    dimGrid((n-1)/TILE_SIZE+1,(m-1)/TILE_SIZE+1,1);
        dim3    dimBlock(TILE_SIZE,TILE_SIZE,1);
        mysgemm<<<dimGrid,dimBlock>>>(size,size,size,A,E,A)


        // 4) Update matrix D 
        mysgemm<<<dimGrid,dimBlock>>>(size,size,1,A,ones,D);
        scal_add<<<dimGrid,dimBlock>>>(size,size, mu, D, D);

        // 5) Update matrix p and vector p2
        update_p2<<<dimGrid,dimBlock>>>(size,mu, D, p2);
        update_p<<<dimGrid,dimBlock>>>(size,p2, D, A, p);

        // 6) Update parameters mu, alpha, omega
        inner_product(mu_t, size, p2, ones, temp_1);
        mu_t /=Times[size-1];

        reduction<<<dimGrid,dimBlock>>>(alpha_t,p,size*size);
        alpha_t/= size;

        // Treat T and p as very long vectors and calculate the inner product
        inner_product(omega_t, size*size, T, p, temp_2);
        omega_t = alpha_t/omega_t;

        // 7) Update error
        ctr++;
        err = (mu - mu_t)*(mu - mu_t) + (alpha-alpha_t)*(alpha-alpha_t) + (omega-omega_t)*(omega-omega_t);
        mu = mu_t;
        alpha = alpha_t;
        omega = omega_t;

        cudaError_t error = cudaGetLastError();
        if(error != cudaSuccess)
        {
            printf("CUDA error: %s\n",cudaGetErrorString(error));
            exit(-1);
        }       
    }
    */
}

However, I think 99% of this code isn't relevant to the issue (I use nothing from "support.h" at the moment). Basically, I get an error dereferencing the pointer on the GPU, even though it is presumably not null. Thanks!

If you do proper CUDA error checking, you'll discover another problem with your code. This line:

cudaMemcpy(Times_d,Times_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); 

should be something like this:

cudaMemcpy(Times_d,Times_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); 
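A common way to do that checking, sketched below, is to wrap every runtime API call in a macro that aborts on failure; CUDA_CHECK is just an illustrative name of mine, not something from your code or from the CUDA API:

#include <stdio.h>
#include <stdlib.h>

// Abort with a readable message whenever a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",                  \
                    cudaGetErrorString(err_), __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// For example:
// CUDA_CHECK(cudaMemcpy(Times_d, Times_h, vec_size*sizeof(double), cudaMemcpyHostToDevice));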

However, that's not the crux of the issue. It took me a while to figure out that you are not making any kernel calls. If you call a kernel, all the parameters you pass to that kernel must be accessible by the device. So if you pass a pointer, the pointer must point to device memory. You are doing this with mu_d, which is a device pointer:

calibrate(vec_size,mu_d,...

But your calibrate is not a kernel!!

It's an ordinary host function running on the host (CPU). So when you try to dereference the device pointer mu_d in host code:

*mu = .11; // ERROR IS HERE!!

You get a seg fault. I'm not sure why you're trying to debug this way, but simply converting kernel calls to host routines while leaving all the parameters the same is not a valid way to debug.
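If all you want is to put an initial value into the memory that mu_d points to, the legal host-side way is to copy the value over. A minimal sketch (init_mu is just an illustrative host variable, not something in your code):

double init_mu = .11;                         // the value lives in host memory
cudaMemcpy(mu_d, &init_mu, sizeof(double),    // copy it into the device allocation
           cudaMemcpyHostToDevice);           // mu_d is only passed, never dereferenced, on the host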

Fundamental CUDA rules (ignoring CUDA 6 Unified Memory):

  1. you cannot dereference a host pointer in device code
  2. you cannot dereference a device pointer in host code

Your code is a violation of the 2nd rule above.
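The other legal place to do that dereference is inside device code. A minimal sketch, where set_scalar is a hypothetical kernel name of my own, not something in your code:

__global__ void set_scalar(double *p, double v)
{
    *p = v;    // dereferencing a device pointer in device code is allowed
}

// launched from the host, passing the device pointer as an argument:
// set_scalar<<<1,1>>>(mu_d, .11);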
