After hours of narrowing down an observation in another project I came up with the following cuda code:
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
void __syncthreads();
int* cudata;
// Negate-and-double one matrix element per thread: thread (threadIdx.y,
// threadIdx.x) multiplies mat[r][c] by -2. The (r < dim && c < dim) guard
// makes the function safe to call from EVERY thread of an oversized block --
// threads outside the matrix simply do nothing. (This internal guard is why
// the call site should not add its own flat-index guard around it.)
__device__ void workMatrix(int** mat, int dim) {
int r = threadIdx.y; //row index into matrix
int c = threadIdx.x; //column index into matrix
if (r < dim && c < dim) mat[r][c] *= -2;
}
// Question's kernel, kept verbatim: it demonstrates the reported failure.
// NOTE(review): two defects, visible in the output above.
// 1) idx uses blockDim.y as the row stride; the row-major flat index of a
//    thread is blockDim.x * threadIdx.y + threadIdx.x. This only coincides
//    here because the launch uses square (threads x threads) blocks.
// 2) workMatrix is called inside the `idx < dim*dim` guard. When the block is
//    wider than dim, a thread can have r < dim and c < dim (so it owns a
//    matrix cell) while its flat idx >= dim*dim -- that thread never runs
//    workMatrix, so its cell is written back unmodified. The __syncthreads()
//    below is correctly placed outside divergent code; the barrier is not the
//    problem, the guard around workMatrix is. There is also no barrier between
//    loading shd and using it inside workMatrix, so the read races the loads
//    done by other threads.
__global__ void kernelTest(int* data, int dim) {
extern __shared__ int shd[]; //shared array size [dim * dim]
int** mat = new int* [dim]; //use 2D-indexing into shared array (per-thread heap allocation)
for (int i = 0; i < dim; i++) mat[i] = shd + i * dim;
int idx = blockDim.y * threadIdx.y + threadIdx.x;
if (idx < dim * dim) {
shd[idx] = data[idx];
workMatrix(mat, dim);
}
__syncthreads(); //DOES NOT HAVE ANY EFFECT, HOW TO SYNCHRONIZE HERE???
if (idx < dim * dim) {
data[idx] = shd[idx];
}
delete[] mat;
}
// Run one kernel test case.
//
// Builds a dim*dim input matrix holding 0..dim*dim-1, launches kernelTest with
// a single (threads x threads) thread block and dim*dim ints of dynamic shared
// memory, then verifies every element came back multiplied by -2 and prints
// OK/FAIL. Every CUDA API result is now checked: kernel launches report
// configuration errors via cudaGetLastError() and execution errors at the next
// synchronizing call, so ignoring them makes failures look like wrong data.
void test(int dim, int threads) {
    // Local checker so each CUDA call is verified where it happens.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            printf("CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        }
    };
    // setup input array: data[i] = i
    int siz = dim * dim;
    int* data = new int[siz];
    for (int i = 0; i < siz; i++) data[i] = i;
    printf("input data [%d..%d] ", data[0], data[siz - 1]);
    // copy data to device
    check(cudaMalloc(&cudata, siz * sizeof(int)), "cudaMalloc");
    check(cudaMemcpy(cudata, data, siz * sizeof(int), cudaMemcpyDefault), "cudaMemcpy H2D");
    // run kernel: one block, (threads x threads) layout, dynamic shared memory
    dim3 thr(threads, threads);
    kernelTest <<<1, thr, siz * sizeof(int) >>> (cudata, dim);
    check(cudaGetLastError(), "kernel launch");          // bad launch config
    check(cudaDeviceSynchronize(), "kernel execution");  // async in-kernel faults
    // return data to host
    int* returnData = new int[siz];
    check(cudaMemcpy(returnData, cudata, siz * sizeof(int), cudaMemcpyDefault), "cudaMemcpy D2H");
    // analyse and print results: every element must equal input * -2
    bool ok = true;
    for (int i = 0; i < siz; i++) ok &= (returnData[i] == data[i] * -2);
    printf("dim=%d, threads=%d ", dim, threads);
    if (ok) {
        printf("OK\n");
    } else {
        printf("FAIL [");
        for (int i = 0; i < siz; i++) printf("%d ", returnData[i]);
        printf("]\n");
    }
    // clear up memory
    check(cudaFree(cudata), "cudaFree");
    delete[] data;
    delete[] returnData;
}
// Entry point: exercise the kernel over several matrix sizes (dim) and block
// widths (threads >= dim), including blocks wider than the matrix, which is
// the configuration that exposes the reported failure.
int main() {
    printf("Test starting\n");
    // {dim, threads} pairs, run in the same order as before.
    const int cases[][2] = {
        {3, 3}, {3, 4}, {3, 5},
        {5, 5}, {5, 6}, {5, 7}, {5, 8}, {5, 12}, {5, 16},
        {6, 6}, {6, 7}, {6, 8}, {6, 9}, {6, 10}, {6, 16},
    };
    for (const auto& c : cases) {
        test(c[0], c[1]);
    }
    // Report any sticky CUDA error left behind by the test runs.
    cudaError_t status = cudaGetLastError();
    if (status != 0) printf("%s\n", cudaGetErrorString(status));
    return status;
}
That code might look more complicated than necessary, but the kernel in the real project should do a lot more computations, which is why I wanted to set up shared memory like that. The output of this code is:
Test starting
input data [0..8] dim=3, threads=3 OK
input data [0..8] dim=3, threads=4 FAIL [0 -2 -4 -6 -8 -10 -12 7 8 ]
input data [0..8] dim=3, threads=5 FAIL [0 -2 -4 -6 -8 -10 6 7 8 ]
input data [0..24] dim=5, threads=5 OK
input data [0..24] dim=5, threads=6 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 21 22 23 24 ]
input data [0..24] dim=5, threads=7 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=8 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 16 17 18 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=12 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 11 12 13 14 15 16 17 18 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=16 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 ]
input data [0..35] dim=6, threads=6 OK
input data [0..35] dim=6, threads=7 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 -46 -48 -50 -52 -54 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=8 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 -46 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=9 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 23 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=10 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=16 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 ]
So the problem here is obviously that when more threads are running than are needed for the matrix manipulation, some values toward the end of the data array are written back to global memory before the workMatrix()
function has done its job. The more threads there are, the more values are wrong.
So far I could not find a way to get synchronization at the line indicated. Using __syncthreads()
does not have any effect. But why is that? In my mind, that is exactly what synchronization should be for.
I see two problems, and synchronization is not the main one.
The flat indices idx computed ahead of the workMatrix
call will not all lie in the range [0, dim*dim)
when the block is larger than the matrix, so your kernel prevents a number of valid threads from modifying their matrix entry. workMatrix
has its own internal guard logic, so it should be executed unconditionally. I would expect something like this to work:
// Fixed kernel: load shared memory, barrier, work, barrier, write back.
// Expects a single block of at least dim*dim threads and dim*dim ints of
// dynamic shared memory (3rd launch parameter).
__global__ void kernelTest(int* data, int dim) {
    extern __shared__ int shd[]; // shared array, size dim * dim
    // Per-thread row-pointer table for 2D indexing into the shared array.
    // NOTE(review): every thread makes its own device-heap allocation here;
    // it works, but direct shd[r * dim + c] indexing would avoid it entirely.
    int** mat = new int* [dim];
    for (int i = 0; i < dim; i++) mat[i] = shd + i * dim;
    // Row-major flat thread index: the row stride is blockDim.x (the block's
    // row width). The original blockDim.y only worked because the launches
    // happened to use square blocks.
    int idx = blockDim.x * threadIdx.y + threadIdx.x;
    if (idx < dim * dim) {
        shd[idx] = data[idx];
    }
    __syncthreads(); // all loads into shd visible block-wide before use
    // workMatrix carries its own (r < dim && c < dim) guard, so call it from
    // every thread; guarding it with idx would skip valid matrix cells.
    workMatrix(mat, dim);
    __syncthreads(); // all updates complete before writing back
    if (idx < dim * dim) {
        data[idx] = shd[idx];
    }
    delete[] mat;
}
[note hacked in browser, never compiled, use at own risk]
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.