简体   繁体   中英

cuda kernel gives incorrect results by grid size increase

I am testing a simple CUDA algorithm for timing and I came across a case that when I increase the grid size of the kernel it gives incorrect results:

#include <unistd.h>
#include <stdio.h>
#include <assert.h>

/* we need these includes for CUDA's random number stuff */
#include <curand.h>
#include <curand_kernel.h>

#define MAX 10

#ifdef GRID
    #define REPEAT GRID
#else 
    #define REPEAT 65535  
#endif  

#ifdef VECSIZE
    #define SIZE VECSIZE 
#else 
    #define SIZE 1024  
#endif 


__global__ void random(int *result) {

    curandState_t state;
    curand_init(100, 0, threadIdx.x, &state);
    result[threadIdx.x] = curand(&state) % MAX;
    //printf("th %d random %d\n", threadIdx.x, *result);
}

__global__ void myadd(const int *in, int *sum) {
    sum[blockIdx.x] = 0;
    //printf("thread %d value %d\n",threadIdx.x,  in[threadIdx.x]);
    atomicAdd_block(&sum[blockIdx.x], in[threadIdx.x]);
    //atomicAdd(sum, in[threadIdx.x]);
}

int main() {
    int check = 0;
    /* allocate an int on the GPU */
    int *x = new int[SIZE];
    int *sum = new int[REPEAT];
    int *d_x, *d_sum;
    cudaMalloc(&d_x, sizeof(int) * SIZE);
    cudaMalloc(&d_sum, sizeof(int) * REPEAT);

    /* invoke the GPU to initialize all of the random states */
    random<<<1, SIZE>>>(d_x);

    myadd<<<REPEAT, SIZE>>>(d_x, d_sum);  
    cudaDeviceSynchronize();
        /* copy the random number back */
    cudaMemcpy(x, d_x, sizeof(int) * SIZE, cudaMemcpyDeviceToHost);
    cudaMemcpy(sum, d_sum, sizeof(int)* REPEAT, cudaMemcpyDeviceToHost);

    for (int i = 0; i < SIZE; ++i) {
        check += x[i];
        //printf("Random[%d] = %d\n", i, x[i]);
    }

    cudaError_t err = cudaGetLastError(); // Get error code

    if (err != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(err));
        exit(-1);
    }

    for (int i = 0; i < REPEAT; ++i) {
        printf("i %d check %d  sum[i] %d\n", i, check, sum[i]);
        assert(check == sum[i]);        
    }
        /* free the memory we allocated */
    cudaFree(d_x);
    cudaFree(d_sum);
    delete[] x;
    delete[] sum;

    return 0;
}

My card is V100 with compute capability of 7.0. As you can see I can compile the above code with different grid and vector sizes with nvcc test.cu -arch=sm_70 -O3 -g -G -DGRID=1024 -DVECSIZE=512 , for small vector and grid sizes everything looks good but when I increase the grid size to max (65535) sometimes the computed sum value is incorrect. For example:

.
.
.
i 511 check 2331  sum[i] 2331
i 512 check 2331  sum[i] 2331
i 513 check 2331  sum[i] 2188
a.out: test.cu:87: int main(): Assertion `check == sum[i]' failed.

There is a race condition in kernel myadd . The sum must only be set to 0 once. And it should not be set to 0 after some other threads added their value to it.

__global__ void myadd(const int *in, int *sum) {
    if(threadIdx.x == 0){
        sum[blockIdx.x] = 0;
    }
    __syncthreads(); // all threads wait until sum is initialized with 0

    atomicAdd_block(&sum[blockIdx.x], in[threadIdx.x]);
}

If you want to time your code properly, you should remove the -G compiler flag.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM