简体   繁体   中英

Segmentation fault in CUDA

I receive "Segmentation fault (core dumped)" when I run this code.

I know that cudaMalloc is the problem, but I have no idea how to solve it. I have just started learning CUDA programming and I'm not familiar with it yet. I'm working on WSL, if that matters.

#include<stdio.h>
#define SIZE 20
/*
 * Element-wise addition: c[k] = a[k] + b[k].
 * Each thread handles the element whose index equals its threadIdx.x;
 * threads whose index is n or larger do nothing.
 */
__global__ void VectorAdd(int *a,int *b,int *c,int n){
    const int tid = threadIdx.x;
    if (tid >= n)
        return;                 /* outside the vector: nothing to add */
    c[tid] = a[tid] + b[tid];
}
/*
 * Question listing, kept exactly as posted: it reproduces the reported
 * segmentation fault. a, b and c are filled in by cudaMalloc and are then
 * read and written directly by host code in the two loops below.
 */
int main(){
    int *a,*b,*c;
    /* cudaMalloc stores a device-memory address in each pointer */
    cudaMalloc((void**)&a,SIZE *sizeof(int));
    cudaMalloc((void**)&b, SIZE *sizeof(int));
    cudaMalloc((void**)&c,SIZE *sizeof(int));

    /* BUG: this loop runs on the host but dereferences the device pointers;
       this is where the program faults. The loop also starts at 1, so
       element 0 is never initialised. */
    for (int i=1;i<SIZE;i++){
        a[i]=i;
        b[i]=i;
        c[i]=0;
    }

    /* one block of SIZE threads; the return codes of the CUDA calls are not checked */
    VectorAdd<<<1, SIZE>>>(a,b,c,SIZE);
    cudaDeviceSynchronize();

    /* BUG: c is dereferenced on the host again; the result is never copied
       back into host memory first. */
    for(int i=1;i<SIZE;i++){
        printf("%d \n",c[i]);
    }

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);

    return 0;
}

As the comments already suggested, memory returned by cudaMalloc lives on the device and cannot be dereferenced from host code. You have to initialize the values of a and b in host arrays, copy them to the device arrays, and, once the computation has completed, copy the data in c back to the host.

#include<stdio.h>
#define SIZE 20
/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread handles the element at
 * its global index (blockIdx.x * blockDim.x + threadIdx.x), so
 * gridDim.x * blockDim.x must be at least n; surplus threads are filtered
 * out by the bounds check. No shared memory is used.
 *
 * a, b : inputs, only read (at least n ints each)
 * c    : output (at least n ints)
 * n    : number of elements to add
 */
__global__ void VectorAdd(const int *a,const int *b,int *c,int n){
    /* global index: with threadIdx.x alone every block would recompute the
       first blockDim.x elements and the rest would never be written */
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i<n)
        c[i]=a[i]+b[i];
}

/*
 * Report a failed CUDA runtime call on stderr.
 * Returns true when status is cudaSuccess, false otherwise.
 */
static bool cudaOk(cudaError_t status, const char *what){
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Adds two SIZE-element vectors on the GPU and prints the sums, one per line.
 * Host arrays are filled with h_a[i] = h_b[i] = i, copied to the device,
 * added by VectorAdd, and the result is copied back before printing.
 * Returns 0 on success and 1 if any CUDA call failed.
 */
int main(){

    const size_t bytes = SIZE * sizeof(int);

    /* device arrays; NULL so that cudaFree below is safe on every path */
    int *a = NULL, *b = NULL, *c = NULL;
    int *h_a, *h_b, *h_c; /* host arrays */

    /* allocate memory for host arrays */
    h_a = new int[SIZE];
    h_b = new int[SIZE];
    h_c = new int[SIZE];

    /* initialize values on host arrays */
    for (int i = 0; i < SIZE; i++){
        h_a[i]=i;
        h_b[i]=i;
    }

    /* allocate device arrays; && stops at the first failing call */
    bool ok = cudaOk(cudaMalloc((void**)&a, bytes), "cudaMalloc(a)")
           && cudaOk(cudaMalloc((void**)&b, bytes), "cudaMalloc(b)")
           && cudaOk(cudaMalloc((void**)&c, bytes), "cudaMalloc(c)");

    /* copy data from host to device */
    ok = ok
      && cudaOk(cudaMemcpy(a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a)")
      && cudaOk(cudaMemcpy(b, h_b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b)");

    if (ok){
        VectorAdd<<<1, SIZE>>>(a,b,c,SIZE);
        /* a launch returns no status; an invalid launch configuration is
           only reported through cudaGetLastError() */
        ok = cudaOk(cudaGetLastError(), "VectorAdd launch");
    }

    /* copy results from device to host; this blocking copy runs after the
       kernel, so no separate cudaDeviceSynchronize() is needed */
    ok = ok
      && cudaOk(cudaMemcpy(h_c, c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");

    /* print only when h_c really holds the result */
    if (ok){
        for(int i = 0; i < SIZE; i++){
            printf("%d \n",h_c[i]);
        }
    }

    /* cudaFree(NULL) is a no-op, so this is safe after a partial failure */
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);

    /* free host memory */
    delete [] h_a;
    delete [] h_b;
    delete [] h_c;

    return ok ? 0 : 1;
}

Notes

  • For some reason you start iterating from position 1 instead of 0 in your for loops. I assumed this was a mistake and changed the loops to start at 0.

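  • cudaMemcpy is a blocking call issued on the default stream, so the device-to-host copy does not start until the preceding kernel has finished. An explicit cudaDeviceSynchronize() after the kernel invocation is therefore not necessary here.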

  • To avoid explicit handling of separated host and device data, you could use cudaMallocManaged instead of cudaMalloc .

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM