After hours of narrowing down an observation in another project I came up with the following cuda code:
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
void __syncthreads();
int* cudata;
// Negate-and-double one matrix element per thread: thread (threadIdx.y,
// threadIdx.x) multiplies mat[r][c] by -2. The (r < dim && c < dim) guard
// makes the function safe to call from EVERY thread of an oversized block --
// threads outside the matrix simply do nothing. (This internal guard is why
// the call site should not add its own flat-index guard around it.)
__device__ void workMatrix(int** mat, int dim) {
int r = threadIdx.y; //row index into matrix
int c = threadIdx.x; //column index into matrix
if (r < dim && c < dim) mat[r][c] *= -2;
}
// Question's kernel, kept verbatim: it demonstrates the reported failure.
// NOTE(review): two defects, visible in the output above.
// 1) idx uses blockDim.y as the row stride; the row-major flat index of a
//    thread is blockDim.x * threadIdx.y + threadIdx.x. This only coincides
//    here because the launch uses square (threads x threads) blocks.
// 2) workMatrix is called inside the `idx < dim*dim` guard. When the block is
//    wider than dim, a thread can have r < dim and c < dim (so it owns a
//    matrix cell) while its flat idx >= dim*dim -- that thread never runs
//    workMatrix, so its cell is written back unmodified. The __syncthreads()
//    below is correctly placed outside divergent code; the barrier is not the
//    problem, the guard around workMatrix is. There is also no barrier between
//    loading shd and using it inside workMatrix, so the read races the loads
//    done by other threads.
__global__ void kernelTest(int* data, int dim) {
extern __shared__ int shd[]; //shared array size [dim * dim]
int** mat = new int* [dim]; //use 2D-indexing into shared array (per-thread heap allocation)
for (int i = 0; i < dim; i++) mat[i] = shd + i * dim;
int idx = blockDim.y * threadIdx.y + threadIdx.x;
if (idx < dim * dim) {
shd[idx] = data[idx];
workMatrix(mat, dim);
}
__syncthreads(); //DOES NOT HAVE ANY EFFECT, HOW TO SYNCHRONIZE HERE???
if (idx < dim * dim) {
data[idx] = shd[idx];
}
delete[] mat;
}
// Run one kernel test case.
//
// Builds a dim*dim input matrix holding 0..dim*dim-1, launches kernelTest with
// a single (threads x threads) thread block and dim*dim ints of dynamic shared
// memory, then verifies every element came back multiplied by -2 and prints
// OK/FAIL. Every CUDA API result is now checked: kernel launches report
// configuration errors via cudaGetLastError() and execution errors at the next
// synchronizing call, so ignoring them makes failures look like wrong data.
void test(int dim, int threads) {
    // Local checker so each CUDA call is verified where it happens.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            printf("CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        }
    };
    // setup input array: data[i] = i
    int siz = dim * dim;
    int* data = new int[siz];
    for (int i = 0; i < siz; i++) data[i] = i;
    printf("input data [%d..%d] ", data[0], data[siz - 1]);
    // copy data to device
    check(cudaMalloc(&cudata, siz * sizeof(int)), "cudaMalloc");
    check(cudaMemcpy(cudata, data, siz * sizeof(int), cudaMemcpyDefault), "cudaMemcpy H2D");
    // run kernel: one block, (threads x threads) layout, dynamic shared memory
    dim3 thr(threads, threads);
    kernelTest <<<1, thr, siz * sizeof(int) >>> (cudata, dim);
    check(cudaGetLastError(), "kernel launch");          // bad launch config
    check(cudaDeviceSynchronize(), "kernel execution");  // async in-kernel faults
    // return data to host
    int* returnData = new int[siz];
    check(cudaMemcpy(returnData, cudata, siz * sizeof(int), cudaMemcpyDefault), "cudaMemcpy D2H");
    // analyse and print results: every element must equal input * -2
    bool ok = true;
    for (int i = 0; i < siz; i++) ok &= (returnData[i] == data[i] * -2);
    printf("dim=%d, threads=%d ", dim, threads);
    if (ok) {
        printf("OK\n");
    } else {
        printf("FAIL [");
        for (int i = 0; i < siz; i++) printf("%d ", returnData[i]);
        printf("]\n");
    }
    // clear up memory
    check(cudaFree(cudata), "cudaFree");
    delete[] data;
    delete[] returnData;
}
// Entry point: exercise the kernel over several matrix sizes (dim) and block
// widths (threads >= dim), including blocks wider than the matrix, which is
// the configuration that exposes the reported failure.
int main() {
    printf("Test starting\n");
    // {dim, threads} pairs, run in the same order as before.
    const int cases[][2] = {
        {3, 3}, {3, 4}, {3, 5},
        {5, 5}, {5, 6}, {5, 7}, {5, 8}, {5, 12}, {5, 16},
        {6, 6}, {6, 7}, {6, 8}, {6, 9}, {6, 10}, {6, 16},
    };
    for (const auto& c : cases) {
        test(c[0], c[1]);
    }
    // Report any sticky CUDA error left behind by the test runs.
    cudaError_t status = cudaGetLastError();
    if (status != 0) printf("%s\n", cudaGetErrorString(status));
    return status;
}
That code might look more complicated than necessary, but the kernel in the real project should do a lot more computations, which is why I wanted to set up shared memory like that. The output of this code is:
Test starting
input data [0..8] dim=3, threads=3 OK
input data [0..8] dim=3, threads=4 FAIL [0 -2 -4 -6 -8 -10 -12 7 8 ]
input data [0..8] dim=3, threads=5 FAIL [0 -2 -4 -6 -8 -10 6 7 8 ]
input data [0..24] dim=5, threads=5 OK
input data [0..24] dim=5, threads=6 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 21 22 23 24 ]
input data [0..24] dim=5, threads=7 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=8 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 16 17 18 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=12 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 11 12 13 14 15 16 17 18 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=16 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 ]
input data [0..35] dim=6, threads=6 OK
input data [0..35] dim=6, threads=7 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 -46 -48 -50 -52 -54 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=8 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 -46 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=9 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 23 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=10 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=16 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 ]
So the problem here is obviously that when more threads are running than are needed for the matrix manipulation, some values toward the end of the data array are written back to global memory before the workMatrix()
function has done its job. The more threads there are, the more values are wrong.
So far I could not find a way to get synchronization at the line indicated. Using __syncthreads()
does not have any effect. But why is that? In my mind, that is exactly what synchronization should be for.
I see two problems, and synchronization is not the main one.
The flat indices idx computed ahead of the workMatrix
call will not all lie in the range [0, dim*dim)
when the block is larger than the matrix, so your kernel prevents a number of valid threads from modifying their matrix entry. workMatrix
has its own internal guard logic, so it should be executed unconditionally. I would expect something like this to work:
// Fixed kernel: load shared memory, barrier, work, barrier, write back.
// Expects a single block of at least dim*dim threads and dim*dim ints of
// dynamic shared memory (3rd launch parameter).
__global__ void kernelTest(int* data, int dim) {
    extern __shared__ int shd[]; // shared array, size dim * dim
    // Per-thread row-pointer table for 2D indexing into the shared array.
    // NOTE(review): every thread makes its own device-heap allocation here;
    // it works, but direct shd[r * dim + c] indexing would avoid it entirely.
    int** mat = new int* [dim];
    for (int i = 0; i < dim; i++) mat[i] = shd + i * dim;
    // Row-major flat thread index: the row stride is blockDim.x (the block's
    // row width). The original blockDim.y only worked because the launches
    // happened to use square blocks.
    int idx = blockDim.x * threadIdx.y + threadIdx.x;
    if (idx < dim * dim) {
        shd[idx] = data[idx];
    }
    __syncthreads(); // all loads into shd visible block-wide before use
    // workMatrix carries its own (r < dim && c < dim) guard, so call it from
    // every thread; guarding it with idx would skip valid matrix cells.
    workMatrix(mat, dim);
    __syncthreads(); // all updates complete before writing back
    if (idx < dim * dim) {
        data[idx] = shd[idx];
    }
    delete[] mat;
}
[note hacked in browser, never compiled, use at own risk]
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.