
cuda shared memory, no synchronisation in kernel, premature output from kernel


After several hours of narrowing down an observation from another project, I came up with the following CUDA code:

#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

void __syncthreads(); //forward declaration, presumably to satisfy the host-side parser; nvcc provides the real intrinsic

int* cudata;

__device__ void workMatrix(int** mat, int dim) {
    int r = threadIdx.y; //row index into matrix
    int c = threadIdx.x; //column index into matrix
    if (r < dim && c < dim) mat[r][c] *= -2;
}

__global__ void kernelTest(int* data, int dim) {
    extern __shared__ int shd[]; //shared array size [dim * dim]
    int** mat = new int* [dim]; //use 2D-indexing into shared array
    for (int i = 0; i < dim; i++) mat[i] = shd + i * dim;
    int idx = blockDim.y * threadIdx.y + threadIdx.x; //flat index into shared array (block is square, so blockDim.y == blockDim.x)
    if (idx < dim * dim) {
        shd[idx] = data[idx];
        workMatrix(mat, dim);
    }
    __syncthreads(); //DOES NOT HAVE ANY EFFECT, HOW TO SYNCHRONIZE HERE???
    if (idx < dim * dim) {
        data[idx] = shd[idx];
    }
    delete[] mat;
}

void test(int dim, int threads) {
    //setup input array
    int siz = dim * dim;
    int* data = new int[siz];
    for (int i = 0; i < siz; i++) data[i] = i;
    printf("input data [%d..%d] ", data[0], data[siz - 1]);
    //copy data to device
    cudaMalloc(&cudata, siz * sizeof(int));
    cudaMemcpy(cudata, data, siz * sizeof(int), cudaMemcpyDefault);
    //run kernel
    dim3 thr(threads, threads);
    kernelTest<<<1, thr, siz * sizeof(int)>>>(cudata, dim);
    cudaDeviceSynchronize();
    //return data to host
    int* returnData = new int[siz];
    cudaMemcpy(returnData, cudata, siz * sizeof(int), cudaMemcpyDefault);
    //analyse and print results
    bool ok = true;
    for (int i = 0; i < siz; i++) ok &= (returnData[i] == data[i] * -2);
    printf("dim=%d, threads=%d ", dim, threads);
    if (ok) {
        printf("OK\n");
    } else {
        printf("FAIL [");
        for (int i = 0; i < siz; i++) printf("%d ", returnData[i]);
        printf("]\n");
    }
    //clear up memory
    cudaFree(cudata);
    delete[] data;
    delete[] returnData;
}

int main() {
    printf("Test starting\n");
    test(3, 3);
    test(3, 4);
    test(3, 5);

    test(5, 5);
    test(5, 6);
    test(5, 7);
    test(5, 8);
    test(5, 12);
    test(5, 16);

    test(6, 6);
    test(6, 7);
    test(6, 8);
    test(6, 9);
    test(6, 10);
    test(6, 16);

    cudaError_t status = cudaGetLastError();
    if (status != 0) printf("%s\n", cudaGetErrorString(status));
    return status;
}

The code may look more complicated than necessary, but in the real project the kernel is supposed to do much more computation, which is why I want to set up the shared memory this way. The output of the code is:

Test starting
input data [0..8] dim=3, threads=3 OK
input data [0..8] dim=3, threads=4 FAIL [0 -2 -4 -6 -8 -10 -12 7 8 ]
input data [0..8] dim=3, threads=5 FAIL [0 -2 -4 -6 -8 -10 6 7 8 ]
input data [0..24] dim=5, threads=5 OK
input data [0..24] dim=5, threads=6 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 21 22 23 24 ]
input data [0..24] dim=5, threads=7 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=8 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 16 17 18 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=12 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 11 12 13 14 15 16 17 18 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=16 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 ]
input data [0..35] dim=6, threads=6 OK
input data [0..35] dim=6, threads=7 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 -46 -48 -50 -52 -54 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=8 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 -46 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=9 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 23 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=10 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=16 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 ]

So the problem here is clearly that when more threads are run than the matrix operation needs, some values at the end of the data array are returned to global memory before the workMatrix() function has finished its work. The more threads, the more wrong values.

So far I have not found a way to synchronize at the indicated line. Using __syncthreads() has no effect whatsoever. But why is that? It seems to me that synchronization is exactly what should happen there?

I see two problems here, and synchronization is not the main one.

  1. If you have "spare" threads, then the threads required to perform the workMatrix call will not all satisfy idx < dim * dim, so your kernel prevents many valid threads from modifying their matrix entries. workMatrix has its own internal guard logic and should be executed unconditionally. (For example, with dim=3 and threads=4, the thread at threadIdx.y=2, threadIdx.x=2 owns the valid matrix entry (2,2), but its flat index is 4*2+2 = 10 >= 9, so it never calls workMatrix; those are exactly the untouched trailing values in the failing output.)
  2. If you wind up running multiple warps in the kernel, then you need synchronization points after the load into shared memory and before the store back to global memory.

I would expect something like this to work:

__global__ void kernelTest(int* data, int dim) {
    extern __shared__ int shd[]; //shared array size [dim * dim]
    int** mat = new int* [dim]; //use 2D-indexing into shared array
    for (int i = 0; i < dim; i++) mat[i] = shd + i * dim;
    int idx = blockDim.y * threadIdx.y + threadIdx.x;
    if (idx < dim * dim) {
        shd[idx] = data[idx];
    }
    __syncthreads(); //make sure the whole matrix is loaded before any thread works on it
    workMatrix(mat, dim); //run unconditionally; it has its own guard logic
    __syncthreads(); //make sure all updates are finished before storing back
    if (idx < dim * dim) {
        data[idx] = shd[idx];
    }
    delete[] mat;
}

[Note: written in the browser, never compiled, use at your own risk]
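
As a side note: every thread in the kernel above executes new int*[dim] and delete[] mat, so each thread builds and frees its own pointer table on the device heap. If the 2D view is all you need, the row/column index can be computed inline instead. Below is a minimal sketch of that variant (the name kernelTestNoAlloc is mine, and like the code above it is untested); it assumes the same square dim3 thr(threads, threads) launch and shared-memory size used in test():

__global__ void kernelTestNoAlloc(int* data, int dim) {
    extern __shared__ int shd[]; //shared array size [dim * dim]
    int idx = blockDim.y * threadIdx.y + threadIdx.x; //flat index, block is square
    if (idx < dim * dim) {
        shd[idx] = data[idx]; //load into shared memory
    }
    __syncthreads(); //whole matrix loaded
    int r = threadIdx.y; //row index into matrix
    int c = threadIdx.x; //column index into matrix
    if (r < dim && c < dim) shd[r * dim + c] *= -2; //2D indexing computed inline
    __syncthreads(); //all updates finished
    if (idx < dim * dim) {
        data[idx] = shd[idx]; //store back to global memory
    }
}

This keeps the same launch configuration but removes the per-thread heap traffic and the pointer indirection; whether that matters depends on how much extra work the real kernel does.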
