
cuda shared memory, no synchronisation in kernel, premature output from kernel


After hours of narrowing down an observation from another project, I came up with the following CUDA code:

#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

void __syncthreads(); //host-scope declaration, presumably to keep IntelliSense quiet; nvcc supplies the real device intrinsic

int* cudata;

__device__ void workMatrix(int** mat, int dim) {
    int r = threadIdx.y; //row index into matrix
    int c = threadIdx.x; //column index into matrix
    if (r < dim && c < dim) mat[r][c] *= -2;
}

__global__ void kernelTest(int* data, int dim) {
    extern __shared__ int shd[]; //shared array size [dim * dim]
    int** mat = new int* [dim]; //use 2D-indexing into shared array
    for (int i = 0; i < dim; i++) mat[i] = shd + i * dim;
    int idx = blockDim.y * threadIdx.y + threadIdx.x;
    if (idx < dim * dim) {
        shd[idx] = data[idx];
        workMatrix(mat, dim);
    }
    __syncthreads(); //DOES NOT HAVE ANY EFFECT, HOW TO SYNCHRONIZE HERE???
    if (idx < dim * dim) {
        data[idx] = shd[idx];
    }
    delete[] mat;
}

void test(int dim, int threads) {
    //setup input array
    int siz = dim * dim;
    int* data = new int[siz];
    for (int i = 0; i < siz; i++) data[i] = i;
    printf("input data [%d..%d] ", data[0], data[siz - 1]);
    //copy data to device
    cudaMalloc(&cudata, siz * sizeof(int));
    cudaMemcpy(cudata, data, siz * sizeof(int), cudaMemcpyDefault);
    //run kernel
    dim3 thr(threads, threads);
    kernelTest <<<1, thr, siz * sizeof(int) >>> (cudata, dim);
    cudaDeviceSynchronize();
    //return data to host
    int* returnData = new int[siz];
    cudaMemcpy(returnData, cudata, siz * sizeof(int), cudaMemcpyDefault);
    //analyse and print results
    bool ok = true;
    for (int i = 0; i < siz; i++) ok &= (returnData[i] == data[i] * -2);
    printf("dim=%d, threads=%d ", dim, threads);
    if (ok) {
        printf("OK\n");

    } else {
        printf("FAIL [");
        for (int i = 0; i < siz; i++) printf("%d ", returnData[i]);
        printf("]\n");
    }
    //clear up memory
    cudaFree(cudata);
    delete[] data;
    delete[] returnData;
}

int main() {
    printf("Test starting\n");
    test(3, 3);
    test(3, 4);
    test(3, 5);

    test(5, 5);
    test(5, 6);
    test(5, 7);
    test(5, 8);
    test(5, 12);
    test(5, 16);

    test(6, 6);
    test(6, 7);
    test(6, 8);
    test(6, 9);
    test(6, 10);
    test(6, 16);

    cudaError_t status = cudaGetLastError();
    if (status != 0) printf("%s\n", cudaGetErrorString(status));
    return status;
}

The code may look more complicated than necessary, but the kernel in the actual project is supposed to do much more computation, which is why I want to set up the shared memory like this. The output of this code is:

Test starting
input data [0..8] dim=3, threads=3 OK
input data [0..8] dim=3, threads=4 FAIL [0 -2 -4 -6 -8 -10 -12 7 8 ]
input data [0..8] dim=3, threads=5 FAIL [0 -2 -4 -6 -8 -10 6 7 8 ]
input data [0..24] dim=5, threads=5 OK
input data [0..24] dim=5, threads=6 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 21 22 23 24 ]
input data [0..24] dim=5, threads=7 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=8 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 16 17 18 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=12 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 11 12 13 14 15 16 17 18 19 20 21 22 23 24 ]
input data [0..24] dim=5, threads=16 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 ]
input data [0..35] dim=6, threads=6 OK
input data [0..35] dim=6, threads=7 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 -46 -48 -50 -52 -54 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=8 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 -46 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=9 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 -40 -42 -44 23 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=10 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30 -32 -34 -36 -38 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 ]
input data [0..35] dim=6, threads=16 FAIL [0 -2 -4 -6 -8 -10 -12 -14 -16 -18 -20 -22 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 ]

So the problem here is obvious: when more threads are run than the matrix operation needs, some values at the end of the data array are written back to global memory before the workMatrix() function has done its work. The more threads, the more wrong values.

So far I have not found any way to get synchronization at the indicated line. Using __syncthreads() has no effect whatsoever. But why is that? Isn't that exactly what synchronization is supposed to do?

I see two problems here, and synchronization is not the main one.

  1. If you have "surplus" threads, then the threads required to perform the workMatrix call will not all fall within [0, dim*dim), so your kernel blocks many valid threads from modifying their matrix entries. workMatrix has its own internal guard logic and should be executed unconditionally (the small check after this list shows exactly which threads get blocked).
  2. If you end up running more than one warp in the kernel, then you need synchronization points after the load into shared memory and before the results are stored back to global memory.
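
To make point 1 concrete, here is a small host-side check (illustrative only, not part of the fix) that enumerates, for the dim=3, threads=4 case, the threads that own a matrix element but are rejected by the idx < dim * dim guard. It reproduces exactly the two untouched elements, shd[7] and shd[8], visible in that FAIL line above:

#include <cstdio>

int main() {
    const int dim = 3, threads = 4; //the dim=3, threads=4 FAIL case
    for (int r = 0; r < threads; r++) {
        for (int c = 0; c < threads; c++) {
            int idx = threads * r + c;        //same flat index the kernel computes
            bool owns = (r < dim && c < dim); //workMatrix would update mat[r][c]
            if (owns && idx >= dim * dim)     //...but the kernel's guard rejects the thread
                printf("thread (y=%d,x=%d): idx=%d blocked, shd[%d] never updated\n",
                       r, c, idx, dim * r + c);
        }
    }
    return 0;
}

This prints threads (y=2,x=1) and (y=2,x=2), i.e. exactly the owners of shd[7] and shd[8].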

I would expect something like this to work:

__global__ void kernelTest(int* data, int dim) {
    extern __shared__ int shd[]; //shared array size [dim * dim]
    int** mat = new int* [dim]; //use 2D-indexing into shared array
    for (int i = 0; i < dim; i++) mat[i] = shd + i * dim;
    int idx = blockDim.y * threadIdx.y + threadIdx.x; //flat index (assumes a square block)
    if (idx < dim * dim) {
        shd[idx] = data[idx]; //load into shared memory
    }
    __syncthreads(); //all loads complete before any thread starts working
    workMatrix(mat, dim); //run unconditionally; it guards itself
    __syncthreads(); //all work complete before the store back
    if (idx < dim * dim) {
        data[idx] = shd[idx];
    }
    delete[] mat;
}

[Note: hacked together in a browser, never compiled, use at your own risk]
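
A side note, in the same untested spirit: every thread in kernelTest allocates and frees its own copy of the mat pointer array on the device heap, which is redundant. Below is a variant sketch (the names workMatrixFlat and kernelTestFlat are just for illustration) that indexes the shared array directly instead. It also uses blockDim.x in the flat index, which only matters if the block is not square:

__device__ void workMatrixFlat(int* shd, int dim) {
    int r = threadIdx.y; //row index into matrix
    int c = threadIdx.x; //column index into matrix
    if (r < dim && c < dim) shd[r * dim + c] *= -2; //row-major 2D index, no pointer table
}

__global__ void kernelTestFlat(int* data, int dim) {
    extern __shared__ int shd[]; //shared array size [dim * dim]
    int idx = blockDim.x * threadIdx.y + threadIdx.x; //row-major flat index
    if (idx < dim * dim) shd[idx] = data[idx]; //load into shared memory
    __syncthreads(); //all loads complete before any thread starts working
    workMatrixFlat(shd, dim); //unconditional; guards itself
    __syncthreads(); //all work complete before the store back
    if (idx < dim * dim) data[idx] = shd[idx]; //store back to global memory
}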
