
How to perform reduction on a huge 2D matrix along the row direction using cuda? (max value and max value's index for each row)

I'm trying to implement a reduction along the row direction of a 2D matrix. I'm starting from a code I found on stackoverflow (thanks a lot Robert!):

thrust::max_element slow in comparison cublasIsamax - More efficient implementation?

The above link shows a custom kernel that performs reduction on a single row. It divides the input row among many blocks, and each block has 1024 threads. It works very well.

For the 2D case, everything's the same except that now there's a y grid dimension, so each block's y dimension is still 1. The problem is that when I try to write data onto the shared memory within each block (within the "max_idx_kernel_reduction_within_block" kernel in the code), it takes very long: more than (# of rows) * (time it takes to perform reduction on 1 row), in which case I would rather just run a for loop over the rows. I know I have a lot of elements, but I was expecting something faster than that.

I don't think the memory access pattern is an issue, but I heard that the TOTAL amount of shared memory might be the limitation? See: CUDA: Is coalesced global memory access faster than shared memory? Also, does allocating a large shared memory array slow down the program?

Any suggestions to make my code faster (the first kernel is the bottleneck)? Thank you very much, very much appreciated!!

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <cuda_runtime.h>

#define NCOLS 163317 // number of columns
#define NROWS 8 // number of rows
#define nTPB 1024  // Threads per Block. nTPB should be a power-of-2
#define MAX_BLOCKS_X ((NCOLS/nTPB)+1) // # of blocks I will launch

#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f // lowest anticipated number of the data. Values in array will be compared with this and updated with the larger one
#define IDX2F(i,j,ld) ((j-1) * (ld) + ((i) - 1))  // 1 based indexing
#define IDX2C(i,j,ld) ((j) * (ld) + i)  // 0 based indexing


__device__ volatile float blk_vals[NROWS][MAX_BLOCKS_X];
__device__ volatile int   blk_idxs[NROWS][MAX_BLOCKS_X];
// blk_vals and blk_idxs are the results obtained from reduction within each block.
// after 1st reduction, each row will have blk_vals[MAX_BLOCKS_X] array and blk_idxs[MAX_BLOCKS_X]
// these will be passed to the 2nd kernel

__global__ void max_idx_kernel_reduction_within_block(const float *data, const int xSize, const int ySize){  // first kernel. Reduction within blocks
  __shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (50 KB). 1024 gives ~ 4KB for single.
  __shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have both indices and values

  int idx = threadIdx.x+blockDim.x * blockIdx.x; // idx in the x direction
  float my_val = FLOAT_MIN; // lowest possible number
  int my_idx = -1;  // to check whether you DID perform the kernel. Again, it's the idx in the x dir.

  // sweep from global memory
  while (idx < xSize){   // this ensures you don't go out the size of the array's x direction
    if (data[IDX2C(blockIdx.y,idx,ySize)] > my_val) {my_val = data[IDX2C(blockIdx.y,idx,ySize)]; my_idx = idx;}
    // compare with my_val, and put the bigger value into my_val for next comparison. my_idx is 0 index based
    idx += blockDim.x*gridDim.x;}
                                                                 // until here takes about 6 ms !! very fast!!

  // populate shared memory: takes ~ 270 ms
  vals[threadIdx.x] = my_val;  // put the computed max value for each thread into the shared memory. -> this is the bottleneck!!
  idxs[threadIdx.x] = my_idx;  // do this for index as well -> this is also slow!!




  __syncthreads();


  // sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1){
    if (threadIdx.x < i)    // the first half threads of the block
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
                            // the above is comparing shared memory of threadIdx.x with shared memory of threadIdx.x + i.
                            // then puts the larger value into shared memory of threadIdx.x
    __syncthreads();}       // so now in each block, shared memory's first element (index 0) is the max value and max value index


  // perform block-level reduction
  if (!threadIdx.x){    // at the shared memory, only the first element (index 0) (actually 2 elements in the first index. max value, and max value index) is what we need.
      blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
                                                // remember, this is a global variable.
    blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index

  __syncthreads();
}

}







  // originally the following kernel was in the 1st kernel, performed by the last block. So just use one block for this.
__global__ void max_idx_kernel_final(int *result_maxInd, float *result_maxVal){

  __shared__ volatile float vals[nTPB]; //  Total amount of shared memory per block: 49152 bytes (50 KB). 1024 gives ~ 4KB for single.
  __shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have these variables!! (vals and idxs)

  int idx = threadIdx.x;
  float my_val = FLOAT_MIN;
  int my_idx = -1;  // remember, these are local variables, so each thread has this variable. This local variable is independent from other thread's local variable
  while (idx < MAX_BLOCKS_X ){                                                          // ?? confused whether it should be gridDim.x (actual # of blocks launched) or MAX_BLOCKS_X (# of elements in x dir of the global array blk_vals)
    if (blk_vals[blockIdx.y][idx] > my_val)
        {my_val = blk_vals[blockIdx.y][idx]; my_idx = blk_idxs[blockIdx.y][idx]; }
    idx += blockDim.x;} // all threads in this single block (single in the x dir) are working, so you should loop over blockDim.x.
                      // Imagine where gridDim.x (# of blocks) is huge so that you need to loop over to get the max value and index
                      // After this, each thread in the block has a local variable (max value and max value index).
                      // So far it was sort of a reduction, but instead of pairing values we just looped over the blk_vals and blk_idxs
  // populate shared memory
  vals[threadIdx.x] = my_val;   // This is now shared memory. This is because reduction requires comparison between different elements
  idxs[threadIdx.x] = my_idx;   // my_idx value is 0 based. This is done for all blocks (in the y direction)
  __syncthreads();
  // Now the final task is to do reduction for all threads in our single block (single block in the x dir, NROWS blocks in the y dir)!

// sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1) {
    if (threadIdx.x < i) // the first half threads of the block
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
    __syncthreads();} // now all the results are in threadIdx.x == 0 for each block (there are NROWS blocks in the y dir)
  // 0th thread. the results are in shared memory, not the local memory, so any thread could do the following. We just selected the 0th thread for no reason. If several threads try to do this, that would be a problem, since we'll have to wait for them

  if(!threadIdx.x){
        result_maxInd[blockIdx.y] = idxs[0]; // the final result for each row goes into the corresponding position (blockIdx.y)
        result_maxVal[blockIdx.y] = vals[0];
      }
}






  int main(){

    dim3 grids(MAX_BLOCKS_X, NROWS);
    dim3 threads(nTPB,1);
    dim3 grids2(1,NROWS);
    dim3 threads2(nTPB);
    float *d_vector, *h_vector;
    h_vector = (float*)malloc(NROWS * NCOLS * sizeof(float));


    for (int j = 1; j <= NCOLS; j++) {
      for (int i = 1; i <= NROWS; i++)  {
          h_vector[IDX2F(i,j,NROWS)] = (float) (rand()/(float)RAND_MAX);
      }
    }



    h_vector[IDX2F(2,5,NROWS)] = 10;  // create definite max element
    cudaMalloc(&d_vector, NROWS * NCOLS * sizeof(float));
    cudaMemcpy(d_vector, h_vector, NROWS * NCOLS * sizeof(float), cudaMemcpyHostToDevice);

    //d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.

    int *max_index;
    float *max_val;
    int *d_max_index;
    float *d_max_val;

    max_index = (int*)malloc(NROWS * sizeof(int));
    max_val = (float*)malloc(NROWS * sizeof(float));
    cudaMalloc((void**)&d_max_index, NROWS * sizeof(int));
    cudaMalloc((void**)&d_max_val, NROWS * sizeof(float));

    max_idx_kernel_reduction_within_block<<<grids, threads>>>(d_vector, NCOLS, NROWS);
    max_idx_kernel_final<<<grids2,threads2>>>(d_max_index, d_max_val);




    cudaMemcpy(max_index, d_max_index, NROWS * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(max_val, d_max_val, NROWS * sizeof(float), cudaMemcpyDeviceToHost);

    for(int z=0;z<20;z++)
printf("%d  ",max_index[z]);

    printf("\n\n\n");

    for(int z=0;z<20;z++)
    printf("%f  ",max_val[z]);

    return 0;
  }

Your code has various issues:

  1. You should use proper cuda error checking. That's just a standard boiler-plate statement I make. I don't think your code was producing any runtime errors. (A minimal sketch of the pattern is shown after this list.)
  2. You should validate your results. I seriously doubt the code was producing sensible results. The reasons why will become evident. If you want to prove this to yourself, modify your data initialization to something that is obviously and easily verifiable, such as I have shown, without making any other changes, and you will see that your program produces errors.
  3. In your kernel, you're not indexing through the arrays correctly. Perhaps you don't understand the IDX2C and IDX2F macros. They are hurting you in two ways: you don't understand the pattern in which they are indexing through your array, and they are killing your coalescing out of global memory (due to the way you have used them). Whenever we have a construct that is an indexed function of threadIdx.x and threadIdx.y (or blockIdx.y in this case), to maintain proper coalescing amongst adjacent threads, it's desirable that the component based on threadIdx.x not get multiplied by any scaling factor. But the way you are passing parameters to IDX2C in your kernel, you are breaking this rule (and also disrupting your desired row-wise access pattern). So for now, let's get rid of those macros, as they are confusing the issue.
  4. This is an illegal use of __syncthreads():

      if (!threadIdx.x){    // at the shared memory, only the first element (index 0) (actually 2 elements in the first index. max value, and max value index) is what we need.
          blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
                                                      // remember, this is a global variable.
          blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
          __syncthreads();
      }

    It's illegal to use it in conditional code unless the condition evaluates the same for every thread in the block. It's entirely unneeded there, so we'll just delete it.

  5. Your printouts were indexing up through 20 instead of NROWS.
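
For item 1, here is a minimal sketch of the usual error-checking pattern. This is generic boilerplate added for illustration (the macro name cudaCheck is my own), not code taken from the answer:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// wrap every CUDA runtime call; kernel launches are checked separately
#define cudaCheck(call)                                                     \
  do {                                                                      \
    cudaError_t err = (call);                                               \
    if (err != cudaSuccess) {                                               \
      fprintf(stderr, "CUDA error: %s at %s:%d\n",                         \
              cudaGetErrorString(err), __FILE__, __LINE__);                 \
      exit(EXIT_FAILURE);                                                   \
    }                                                                       \
  } while (0)

// usage (assuming the names from the code below):
//   cudaCheck(cudaMalloc(&d_vector, NROWS * NCOLS * sizeof(float)));
//   max_idx_kernel_reduction_within_block<<<grids, threads>>>(d_vector, NCOLS, NROWS);
//   cudaCheck(cudaGetLastError());        // catches launch-configuration errors
//   cudaCheck(cudaDeviceSynchronize());   // catches errors during kernel execution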

With the above changes, your code goes from being broken (producing incorrect results) to being fixed, and execution time for the kernels on my system goes from 0.929ms to 0.656ms. I attribute all of this to the coalescing fix in item 3 above.
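
To make the indexing change concrete, here is a small standalone illustration (my addition, not part of the answer's code) of the linear offsets touched by adjacent threads under the original and the fixed indexing, using the NROWS/NCOLS values from the code:

#include <cstdio>

int main(){
    const int xSize = 163317;  // NCOLS
    const int ySize = 8;       // NROWS
    const int row   = 2;       // blockIdx.y, i.e. which row this block works on
    for (int tid = 0; tid < 4; tid++){                // a few adjacent threads of a warp
        int orig_off  = tid * ySize + row;            // IDX2C(blockIdx.y, idx, ySize): threads are ySize floats apart
        int fixed_off = row * xSize + tid;            // idy*xSize + idx: consecutive floats, so a warp's loads coalesce
        printf("thread %d: original offset %d, fixed offset %d\n", tid, orig_off, fixed_off);
    }
    return 0;
}

With the original expression, each thread in a warp lands in a different memory segment, which matches the low global load efficiency reported below; with the fixed expression, the warp reads one contiguous stretch of the row.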

When I profile the before and after cases with nvprof --metrics gld_efficiency ..., it shows 12.5% efficiency with your original code and 53% efficiency with the changes. Here's the modified code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>

#define NCOLS 163317 // number of columns
#define NROWS 8 // number of rows
#define nTPB 1024  // Threads per Block. nTPB should be a power-of-2
#define MAX_BLOCKS_X ((NCOLS/nTPB)+1) // # of blocks I will launch

#define FLOAT_MIN -1.0f // lowest anticipated number of the data. Values in array will be compared with this and updated with the larger one


__device__ volatile float blk_vals[NROWS][MAX_BLOCKS_X];
__device__ volatile int   blk_idxs[NROWS][MAX_BLOCKS_X];
// blk_vals and blk_idxs are the results obtained from reduction within each block.
// after 1st reduction, each row will have blk_vals[MAX_BLOCKS_X] array and blk_idxs[MAX_BLOCKS_X]
// these will be passed to the 2nd kernel

__global__ void max_idx_kernel_reduction_within_block(const float *data, const int xSize, const int ySize){  // first kernel. Reduction within blocks
  __shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (50 KB). 1024 gives ~ 4KB for single.
  __shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have both indices and values

  int idx = threadIdx.x+blockDim.x * blockIdx.x; // idx in the x direction
  int idy = blockIdx.y;
  float my_val = FLOAT_MIN; // lowest possible number
  int my_idx = -1;  // to check whether you DID perform the kernel. Again, it's the idx in the x dir.

  // sweep from global memory
  while (idx < xSize){   // this ensures you don't go out the size of the array's x direction
    float temp = data[idy*xSize+idx];
    if (temp > my_val) {my_val = temp; my_idx = idx;}
    // compare with my_val, and put the bigger value into my_val for next comparison. my_idx is 0 index based
    idx += blockDim.x*gridDim.x;}
                                                                 // until here takes about 6 ms !! very fast!!
  // populate shared memory: takes ~ 270 ms
  vals[threadIdx.x] = my_val;  // put the computed max value for each thread into the shared memory. -> this is the bottleneck!!
  idxs[threadIdx.x] = my_idx;  // do this for index as well -> this is also slow!!

  __syncthreads();

  // sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1){
    if (threadIdx.x < i)    // the first half threads of the block
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
                            // the above is comparing shared memory of threadIdx.x with shared memory of threadIdx.x + i.
                            // then puts the larger value into shared memory of threadIdx.x
    __syncthreads();}       // so now in each block, shared memory's first element (index 0) is the max value and max value index


  // perform block-level reduction
  if (!threadIdx.x){    // at the shared memory, only the first element (index 0) (actually 2 elements in the first index. max value, and max value index) is what we need.
      blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
                                                // remember, this is a global variable.
      blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
  }

}

  // originally the following kernel was in the 1st kernel, performed by the last block. So just use one block for this.
__global__ void max_idx_kernel_final(int *result_maxInd, float *result_maxVal){

  __shared__ volatile float vals[nTPB]; //  Total amount of shared memory per block: 49152 bytes (50 KB). 1024 gives ~ 4KB for single.
  __shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have these variables!! (vals and idxs)

  int idx = threadIdx.x;
  int idy = blockIdx.y;
  float my_val = FLOAT_MIN;
  int my_idx = -1;  // remember, these are local variables, so each thread has this variable. This local variable is independent from other thread's local variable
  while (idx < MAX_BLOCKS_X ){                                                          // ?? confused whether it should be gridDim.x (actual # of blocks launched) or MAX_BLOCKS_X (# of elements in x dir of the global array blk_vals)
    float temp = blk_vals[idy][idx];
    if (temp > my_val)
        {my_val = temp; my_idx = blk_idxs[idy][idx]; }
    idx += blockDim.x;} // all threads in this single block (single in the x dir) are working, so you should loop over blockDim.x.
                      // Imagine where gridDim.x (# of blocks) is huge so that you need to loop over to get the max value and index
                      // After this, each thread in the block has a local variable (max value and max value index).
                      // So far it was sort of a reduction, but instead of pairing values we just looped over the blk_vals and blk_idxs
  // populate shared memory
  idx = threadIdx.x;
  vals[idx] = my_val;   // This is now shared memory. This is because reduction requires comparison between different elements
  idxs[idx] = my_idx;   // my_idx value is 0 based. This is done for all blocks (in the y direction)
  __syncthreads();
  // Now the final task is to do reduction for all threads in our single block (single block in the x dir, NROWS blocks in the y dir)!

// sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1) {
    if (idx < i) // the first half threads of the block
      if (vals[idx] < vals[idx + i]) {vals[idx] = vals[idx+i]; idxs[idx] = idxs[idx+i]; }
    __syncthreads();} // now all the results are in threadIdx.x == 0 for each block (there are NROWS blocks in the y dir)
  // 0th thread. the results are in shared memory, not the local memory, so any thread could do the following. We just selected the 0th thread for no reason. If several threads try to do this, that would be a problem, since we'll have to wait for them

  if(!threadIdx.x){
        result_maxInd[idy] = idxs[0]; // the final result for each row goes into the corresponding position (blockIdx.y)
        result_maxVal[idy] = vals[0];
      }
}


int main(){

    dim3 grids(MAX_BLOCKS_X, NROWS);
    dim3 threads(nTPB,1);
    dim3 grids2(1,NROWS);
    dim3 threads2(nTPB);
    float *d_vector, *h_vector;
    h_vector = (float*)malloc(NROWS * NCOLS * sizeof(float));
    memset(h_vector, 0, NROWS*NCOLS*sizeof(float));
    for (int i =  0; i < NROWS; i++)
      h_vector[i*NCOLS + i] = 10.0f;  // create definite max element per row
    cudaMalloc(&d_vector, NROWS * NCOLS * sizeof(float));
    cudaMemcpy(d_vector, h_vector, NROWS * NCOLS * sizeof(float), cudaMemcpyHostToDevice);

    //d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.

    int *max_index;
    float *max_val;
    int *d_max_index;
    float *d_max_val;

    max_index = (int*)malloc(NROWS * sizeof(int));
    max_val = (float*)malloc(NROWS * sizeof(float));
    cudaMalloc((void**)&d_max_index, NROWS * sizeof(int));
    cudaMalloc((void**)&d_max_val, NROWS * sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start); cudaEventCreate(&stop);
    cudaEventRecord(start);
    max_idx_kernel_reduction_within_block<<<grids, threads>>>(d_vector, NCOLS, NROWS);
    max_idx_kernel_final<<<grids2,threads2>>>(d_max_index, d_max_val);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float et;
    cudaEventElapsedTime(&et, start, stop);
    printf("elapsed time: %fms\n", et);

    cudaMemcpy(max_index, d_max_index, NROWS * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(max_val, d_max_val, NROWS * sizeof(float), cudaMemcpyDeviceToHost);

    for(int z=0;z<NROWS;z++)
      printf("%d  ",max_index[z]);

    printf("\n\n\n");

    for(int z=0;z<NROWS;z++)
      printf("%f  ",max_val[z]);
    printf("\n");
    return 0;
}
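
Following item 2 above, a simple host-side check can recompute the per-row argmax on the CPU and compare it with what the kernels returned. This helper (validate_rows is a hypothetical name, my addition) assumes it is compiled together with the program above, which supplies NROWS, NCOLS, and stdio.h, and it assumes the row-major layout used by the fixed code:

void validate_rows(const float *h_vector, const int *max_index, const float *max_val){
    for (int r = 0; r < NROWS; r++){
        int   ref_idx = 0;
        float ref_val = h_vector[r * NCOLS];          // row-major layout, as in the fixed code
        for (int c = 1; c < NCOLS; c++){
            float v = h_vector[r * NCOLS + c];
            if (v > ref_val){ ref_val = v; ref_idx = c; }
        }
        if (ref_idx != max_index[r] || ref_val != max_val[r])
            printf("mismatch in row %d: expected (%d, %f), got (%d, %f)\n",
                   r, ref_idx, ref_val, max_index[r], max_val[r]);
    }
}

Calling validate_rows(h_vector, max_index, max_val) after the device-to-host copies should print nothing when the kernels are correct.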