
CUDA matrix transpose with shared memory

I need to implement a matrix transpose function on a GPU using shared memory. I have a simple version without shared memory that works fine, and an attempt with shared memory that unfortunately produces incorrect results, and I cannot figure out why. A complete working example can be found at the bottom of this question.

EDIT 1

I further know that the first wrong value in the result is at index 32 (of the flattened matrix, i.e. matr[0][32] in two-dimensional terms).

If any more information is needed, I will gladly provide it.

A short extract of the non-working function from the full code is listed below:

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width,
    const int height, const int nreps)
{
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];
    int blockIdx_y = blockIdx.x;
    int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)* width;

    xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)* height;

    int r, i;
#pragma unroll
    for (r = 0; r < nreps; r++)
    {
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];

        __syncthreads();

#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if (index_in + i * width < width * height)
               matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
    }
}

The output looks like this:

Avg. CPU Transpose Time: 0.106048 ms, Bandwidth: 3.771873 GB/s

Avg. GPU Naive Trans Time: 0.009871 ms, bandwidth: 40.520836 GB/s
    Correct: 50000, Wrong: 0

Avg. GPU Trans with SM Time: 0.007598 ms, bandwidth: 52.643482 GB/s
    Correct: 12352, Wrong: 37648

Here is the full example. I stripped most of the unnecessary code so it is less cluttered:

#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

#define TILE_DIM 32
#define BLOCK_ROWS 8
#define BLOCK_COLS 32

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation);
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);

int main()
{
    int i, width, height, nreps, size, wrong, correct;
    double cpuTime, cpuBandwidth;
    cudaError_t cudaStatus;

    float *matrA, *matrATC, *matrATG, *matrAC;

    srand(time(NULL));

    nreps = 10000;
    width = 500;
    height = 100;
    size = width * height;

    matrA = (float*)malloc(size * sizeof(float)); // matrix A
    matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied
    matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU
    matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU

    for (i = 0; i < size; i++)
    {
        matrA[i] = (float)i;
    }

    auto start = std::chrono::high_resolution_clock::now();

    //CPU Transpose
    cpuMatrTrans(matrATC, matrA, width, height, nreps);

    auto end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> diff = end - start;
    cpuTime = (diff.count() * 1000) / nreps;
    cpuBandwidth = (sizeof(float) * size * 2) / (cpuTime * 1000000);// ms-to-s and B-to-GB scaling done implicitly (factors cancel); times two for read and write
    printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);

    correct = 0;
    wrong = 0;

    //Naive transpose
    matrMagicCuda(matrATG, matrA, width, height, nreps, 1);

    //Check if calc was correct
    for (i = 0; i < size; i++)
    {
        if (matrATC[i] != matrATG[i])
        {
            /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
            return;*/
            wrong++;
        }
        else
        {
            correct++;
        }
    }

    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
    correct = 0;
    wrong = 0;

    //Transpose with shared memory
    matrMagicCuda(matrATG, matrA, width, height, nreps, 2);

    //Check if calc was correct
    for (i = 0; i < size; i++)
    {
        if (matrATC[i] != matrATG[i])
        {
            /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
            return;*/
            wrong++;
        }
        else
        {
            correct++;
        }
    }

    //printf("\tTranspose with SM on GPU was executed correctly.\n\n");
    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
    correct = 0;
    wrong = 0;

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation)
{
    float elapsed = 0;
    float *dev_matrA = 0;
    float *dev_matrB = 0;
    cudaError_t cudaStatus;
    dim3 dim_grid, dim_block;
    double gpuBandwidth;

    int size = width * height;

    dim_block.x = TILE_DIM;
    dim_block.y = BLOCK_ROWS;
    dim_block.z = 1;

    dim_grid.x = (width + TILE_DIM - 1) / TILE_DIM;
    dim_grid.y = (height + TILE_DIM - 1) / TILE_DIM;
    dim_grid.z = 1;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three matrix
    cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input matrix from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    switch (operation)
    {
        case(1):
        {
            cudaEventRecord(start);
            // Launch a kernel on the GPU with one thread for each element.
            naiveTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            cudaEventElapsedTime(&elapsed, start, stop);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            elapsed /= nreps;

            gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);// ms-to-s and B-to-GB scaling done implicitly (factors cancel); times two for read and write
            printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

            break;
        }

        case(2):
        {
            cudaEventRecord(start);
            // Launch a kernel on the GPU with one thread for each element.
            notSoNaivaTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            cudaEventElapsedTime(&elapsed, start, stop);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            elapsed /= nreps;

            gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);// ms-to-s and B-to-GB scaling done implicitly (factors cancel); times two for read and write
            printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

            break;
        }

    default:
        printf("No matching opcode was found.\n");
    }

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output matrix from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_matrB);
    cudaFree(dev_matrA);

    return cudaStatus;
}

void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    int i, j, r;

#pragma unroll
    for (r = 0; r < nreps; r++)
#pragma unroll
        for (i = 0; i < height; i++)
#pragma unroll
            for (j = 0; j < width; j++)
                matrB[j * height + i] = matrA[i * width + j];
}

__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    int i, r;
    int row = blockIdx.x * TILE_DIM + threadIdx.x;
    int col = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = row + width * col;
    int index_out = col + height * row;

#pragma unroll
    for (r = 0; r < nreps; r++)
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if (index_in + i * width < width * height)
                matrB[index_out + i] = matrA[index_in + i * width];
}

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];
    int blockIdx_y = blockIdx.x;
    int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)* width;

    xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)* height;

    int r, i;
#pragma unroll
    for (r = 0; r < nreps; r++)
    {
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];

        __syncthreads();

#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if (index_in + i * width < width * height)
               matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
    }
}

There were a number of issues with this code. I'm not sure I will be able to cover all of them.

Possibly the most important issue was your lack of (and lack of understanding of) a proper 2D thread check. Your algorithm creates a grid of threads that is larger in both dimensions than the problem size. This creates logical threads outside the dimensions of your matrix, in both directions.

You have attempted to create a 2D thread check like this:

        if (index_in + i * width < width * height)

This will not work. Suppose I have a 3x3 matrix and a 4x4 grid of threads. The thread at (3,0) is clearly out-of-bounds for your matrix, but would pass your 2D thread check.

In this situation, a proper thread check must test each dimension separately, not as a product.
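To make the failure mode concrete: with width = 3, the thread at (3,0) computes index_in = 3 + 0 * 3 = 3, which passes the product test 3 < 9 even though column 3 does not exist. Below is a minimal sketch of a per-dimension check; the kernel and variable names are illustrative, not part of the code in this question:

__global__ void boundsCheckedCopy(float *out, const float *in, const int width, const int height)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x; // x maps to columns
    int row = blockIdx.y * blockDim.y + threadIdx.y; // y maps to rows

    // Test each dimension on its own. A combined product test such as
    // (row * width + col < width * height) lets out-of-range columns
    // of in-range rows slip through.
    if (col < width && row < height)
        out[row * width + col] = in[row * width + col];
}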

Note that this logical error exists in your "naive" transpose kernel as well, and you can confirm that by running your code with cuda-memcheck. It will indicate out-of-bounds access errors in that kernel, even though it appears to work correctly.

There were various other problems as well. Most of these had to do with indexing in your shared memory kernel. It's not clear to me that you understood the necessary indexing manipulation for a shared memory transpose. There are two separate indexing transposes that we must do in this case:

  1. Transpose the block (tile) indices
  2. Transpose the thread indices

Transposition of the thread indices is done upon reading/writing shared memory. You have this correctly accounted for, with the use of threadIdx.x and threadIdx.y reversed between the read and the write of shared memory. But as near as I can tell, your index generation for the reversal of the block indices (the reversal used when reading/writing global memory) was broken. That was the other major issue to be sorted out; the sketch below shows the two transposes in isolation.
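This is a minimal sketch of the two transposes, without the diagonal block-reordering trick from the question's kernel. The variable names are illustrative, but the indexing mirrors the corrected kernel further down:

// Read coordinates: tile (blockIdx.x, blockIdx.y) of the input, natural orientation.
int in_col  = blockIdx.x * TILE_DIM + threadIdx.x;
int in_row  = blockIdx.y * TILE_DIM + threadIdx.y;

// Write coordinates: the block indices are swapped (transpose 1, the tile
// positions). The thread indices are not swapped here, because they are
// swapped at the shared-memory access instead (transpose 2, the elements
// within a tile).
int out_col = blockIdx.y * TILE_DIM + threadIdx.x;
int out_row = blockIdx.x * TILE_DIM + threadIdx.y;

tile[threadIdx.y][threadIdx.x] = in[in_row * width + in_col];     // load, natural
__syncthreads();
out[out_row * height + out_col] = tile[threadIdx.x][threadIdx.y]; // store, transposed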

The following code has these and some other issues fixed, and appears to work correctly for me:

$ cat t33.cu    
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

#define TILE_DIM 32
#define BLOCK_ROWS 8
#define BLOCK_COLS 32

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation);
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);

int main()
{
    int i, width, height, nreps, size, wrong, correct;
    double cpuTime, cpuBandwidth;
    cudaError_t cudaStatus;

    float *matrA, *matrATC, *matrATG, *matrAC;

    srand(time(NULL));

    nreps = 10000;
    width = 500;
    height = 100;


    size = width * height;

    matrA = (float*)malloc(size * sizeof(float)); // matrix A
    matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied
    matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU
    matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU

    for (i = 0; i < size; i++)
    {
        matrA[i] = (float)i;
    }

    auto start = std::chrono::high_resolution_clock::now();

    //CPU Transpose
    cpuMatrTrans(matrATC, matrA, width, height, nreps);

    auto end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> diff = end - start;
    cpuTime = (diff.count() * 1000) / nreps;
    cpuBandwidth = (sizeof(float) * size * 2) / (cpuTime * 1000000);// ms-to-s and B-to-GB scaling done implicitly (factors cancel); times two for read and write
    printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);

    correct = 0;
    wrong = 0;

    //Naive transpose
    memset(matrATG, 0, size*sizeof(float));
    matrMagicCuda(matrATG, matrA, width, height, nreps, 1);

    //Check if calc was correct
    for (i = 0; i < size; i++)
    {
        if (matrATC[i] != matrATG[i])
        {
            /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
            return;*/
            wrong++;
        }
        else
        {
            correct++;
        }
    }

    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
    correct = 0;
    wrong = 0;

    //Transpose with shared memory
    memset(matrATG, 0, size*sizeof(float));
    matrMagicCuda(matrATG, matrA, width, height, nreps, 2);

    //Check if calc was correct
    for (i = 0; i < size; i++)
    {
        if (matrATC[i] != matrATG[i])
        {
            /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
            return;*/
            wrong++;
        }
        else
        {
            correct++;
        }
    }

    //printf("\tTranspose with SM on GPU was executed correctly.\n\n");
    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
    correct = 0;
    wrong = 0;

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation)
{
    float elapsed = 0;
    float *dev_matrA = 0;
    float *dev_matrB = 0;
    cudaError_t cudaStatus;
    dim3 dim_grid, dim_block;
    double gpuBandwidth;

    int size = width * height;

    dim_block.x = TILE_DIM;
    dim_block.y = BLOCK_ROWS;
    dim_block.z = 1;

    dim_grid.x = (width + TILE_DIM - 1) / TILE_DIM;
    dim_grid.y = (height + TILE_DIM - 1) / TILE_DIM;
    dim_grid.z = 1;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three matrix
    cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input matrix from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaMemset(dev_matrB, 0, size * sizeof(float));
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    switch (operation)
    {
        case(1):
        {
            cudaEventRecord(start);
            // Launch a kernel on the GPU with one thread for each element.
            naiveTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            cudaEventElapsedTime(&elapsed, start, stop);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            elapsed /= nreps;

            gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);// ms-to-s and B-to-GB scaling done implicitly (factors cancel); times two for read and write
            printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

            break;
        }

        case(2):
        {
            cudaEventRecord(start);
            // Launch a kernel on the GPU with one thread for each element.
            notSoNaivaTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            cudaEventElapsedTime(&elapsed, start, stop);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            elapsed /= nreps;

            gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);// ms-to-s and B-to-GB scaling done implicitly (factors cancel); times two for read and write
            printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

            break;
        }

    default:
        printf("No matching opcode was found.\n");
    }

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output matrix from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_matrB);
    cudaFree(dev_matrA);

    return cudaStatus;
}

void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    int i, j, r;

#pragma unroll
    for (r = 0; r < nreps; r++)
#pragma unroll
        for (i = 0; i < height; i++)
#pragma unroll
            for (j = 0; j < width; j++)
                matrB[j * height + i] = matrA[i * width + j];
}

__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    int i, r;
    int col = blockIdx.x * TILE_DIM + threadIdx.x;
    int row = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = col + width * row;
    int index_out = row + height * col;

#pragma unroll
    for (r = 0; r < nreps; r++)
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if ((row+i<height) && (col < width))
                matrB[index_out + i] = matrA[index_in + i * width];
}

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];
    int ciIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int riIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int coIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    int roIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    int index_in = ciIndex + (riIndex)* width;
    int index_out = coIndex + (roIndex)* height;


    int r, i;
#pragma unroll
    for (r = 0; r < nreps; r++)
    {
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if ((ciIndex<width) && (riIndex+i < height))
              tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];
        __syncthreads();

#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if ((coIndex<height) && (roIndex+i < width))
               matrB[index_out + i*height] = tile[threadIdx.x][threadIdx.y + i];
        __syncthreads();
    }
}
$ nvcc -std=c++11 -arch=sm_61 -o t33 t33.cu
t33.cu(25): warning: variable "matrAC" was set but never used

t33.cu(25): warning: variable "matrAC" was set but never used

$ cuda-memcheck ./t33
========= CUDA-MEMCHECK
Avg. CPU Transpose Time: 0.143087 ms, Bandwidth: 2.795509 GB/s

Avg. GPU Naive Trans Time: 0.028587 ms, bandwidth: 13.992195 GB/s
        Correct: 50000, Wrong: 0

Avg. GPU Trans with SM Time: 0.040328 ms, bandwidth: 9.918678 GB/s
        Correct: 50000, Wrong: 0

========= ERROR SUMMARY: 0 errors
$ ./t33
Avg. CPU Transpose Time: 0.140469 ms, Bandwidth: 2.847594 GB/s

Avg. GPU Naive Trans Time: 0.003828 ms, bandwidth: 104.505440 GB/s
        Correct: 50000, Wrong: 0

Avg. GPU Trans with SM Time: 0.000715 ms, bandwidth: 559.206604 GB/s
        Correct: 50000, Wrong: 0

$

Note: The code attempts to measure bandwidth. However, you should be aware that the measured bandwidth here is affected by the cache bandwidth. Your matrix sizes (500x100 = 200 Kbytes each for input and output) are easily small enough to fit in the L2 cache on most GPUs. This fact, coupled with the fact that you are running the same transpose multiple times (nreps), means that much of the work is operating directly out of the L2 cache. Therefore, in the "optimized" case above, we see a reported bandwidth number that greatly exceeds the available memory bandwidth of the GPU (this case happens to be a Pascal Titan X, so about ~340GB/s of main memory bandwidth is available). This is because the measurement includes some benefit from the L2 cache, whose bandwidth is at least twice as high as main memory bandwidth. You can eliminate this effect by using a significantly larger matrix size and/or by reducing nreps to 1.
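For reference, here is the arithmetic behind those numbers as a small host-side sketch. The 0.000715 ms figure is the measured shared-memory kernel time from the run above; everything else follows from the 500x100 float matrices used in this question:

#include <cstdio>

int main()
{
    const int width = 500, height = 100;
    const double bytesMoved = 2.0 * width * height * sizeof(float); // read + write = 400,000 bytes
    const double elapsedMs  = 0.000715; // avg. shared-memory kernel time from the run above

    // Each matrix is 200 KB, so input and output together (~400 KB) fit
    // comfortably in a multi-megabyte L2 cache; the repeated (nreps)
    // transposes are then served largely from L2 rather than DRAM.
    // Effective bandwidth: bytes / (ms * 1e6) yields GB/s.
    printf("working set: %.0f KB, effective bandwidth: %.1f GB/s\n",
           bytesMoved / 1000.0, bytesMoved / (elapsedMs * 1e6));
    return 0;
}

This reproduces the reported ~559 GB/s figure, well above the ~340 GB/s of DRAM bandwidth, which is the cache effect described above.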
