Merge two CUDA kernels into one

I am using CUDA to compute out = C(b(A(in))), where functions A and C are convolutions and b is an element-wise function. A toy example is:

#define N 1000

__device__ float b(float d_in){return min(d_in + 10.0f, 100.0f);}
__global__ void bA(float *d_in, float *d_out){
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    if (x >= N)  return;

    // replicate boundary
    int x_left  = max(x-1, 0); int x_right = min(x+1, N-1);

    d_out[x] = b( d_in[x_left] + d_in[x] + d_in[x_right] );
}
__global__ void C(float *d_in, float *d_out){
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    if (x >= N)  return;

    // replicate boundary        
    int x_left  = max(x-1, 0); int x_right = min(x+1, N-1);

    d_out[x] = d_in[x_left] + d_in[x] + d_in[x_right];
}
void myfunc(float *d_data, float *d_temp){
    dim3 threads(256);
    dim3 blocks( (N + threads.x - 1) / threads.x ); // divide up

    // kernels that I would like to merge into one:
    bA<<<blocks, threads>>>(d_data, d_temp);
    C <<<blocks, threads>>>(d_temp, d_data);
}

Computing like this needs an additional variable d_temp, which I do not want. So I would like to merge these kernels into one, i.e. a single kernel that computes C(b(A(in))).

One difficulty is: how can I hold the temporary results of b(A(in)) and then perform the convolution C()? I have tried using shared memory, but I am at a loss as to how to load the temporary result b(A(in)) into shared memory. For example:

#define BLOCK_SIZE 32

__global__ void CbA(float *d_in, float *d_out){
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    if (x >= N)  return;

    // replicate boundary
    int x_left  = max(x-1, 0); int x_right = min(x+1, N-1);

    // temp result for b(A(in))
    float temp = b( d_in[x_left] + d_in[x] + d_in[x_right] );

    // shared memory for convolution (stencil size of 3)
    __shared__ float shmem[BLOCK_SIZE+2];

    // load center part to shared memory
    shmem[threadIdx.x+1] = temp;

    // but how to load boundary parts from temp to shmem?
    // ...

    __syncthreads();

    // perform function C()
    // ...
}

Any advice or hints are highly appreciated.

First a comment about

// load center part to shared memory
shmem[threadIdx.x+1] = temp;

I would call that saving to the shared memory...

Besides that a few ideas:

Use the first and last thread in a block to only calculate b(A(in))

Of course you then have to account for this in your calculation of x ( const int x = threadIdx.x + blockIdx.x * (blockDim.x-2) - 1; ); the extra -1 lets the halo thread of the first block cover the replicated left boundary. You also have to invoke your kernel with more blocks; a sketch of the adjusted launch follows the kernel below.
You will then have two threads per block idle when you perform C(). But that should not have a big impact.
Here's the kernel. It is easier to understand if you try to visualize the flow of the calculations.

__global__ void CbA(float *d_in, float *d_out)
{
  // cast before subtracting so thread 0 of block 0 gets x == -1 (no unsigned wrap)
  const int x = (int)(threadIdx.x + blockIdx.x * (blockDim.x - 2)) - 1;

  // replicate boundary; the clamp also handles the x == -1 halo of the first
  // block and replaces the early return, so every thread reaches __syncthreads()
  const int xc = min(max(x, 0), N-1);
  int x_left  = max(xc-1, 0); int x_right = min(xc+1, N-1);
  float temp = b( d_in[x_left] + d_in[xc] + d_in[x_right] );

  __shared__ float shmem[BLOCK_SIZE]; // = 256
  shmem[threadIdx.x] = temp;
  __syncthreads();

  // only interior threads apply C(); threads 0 and blockDim.x-1 were halo-only
  if (threadIdx.x > 0 && threadIdx.x < blockDim.x-1 && x < N)
    d_out[x] = shmem[threadIdx.x-1] + shmem[threadIdx.x] + shmem[threadIdx.x+1];
}
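
Each block now produces only blockDim.x - 2 outputs, so the host code must launch correspondingly more blocks. A minimal sketch of the adjusted myfunc, assuming BLOCK_SIZE is also the thread-block size and that d_out is a buffer distinct from d_in (adjacent blocks re-read input elements whose outputs another block may already have written, so running the fused kernel in place is not safe):

void myfunc(float *d_in, float *d_out){
    dim3 threads(BLOCK_SIZE);
    // each block yields blockDim.x - 2 results, so round up against that
    dim3 blocks( (N + (threads.x - 2) - 1) / (threads.x - 2) );
    CbA<<<blocks, threads>>>(d_in, d_out);
}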

Let one thread in a block also perform b(A()) for the "boundary parts" of a block

But then you would only utilize 1 out of 32 threads for that calculation for every block. The worst case would be a ratio of 1/32 for the whole SM for the time of the additional computation.

...
// but how to load boundary parts from temp to shmem?
if (threadIdx.x == 0)
{
  { // left halo: b(A(in)) for the element just before this block
    const int x = max((int)(blockIdx.x * blockDim.x) - 1, 0);
    int x_left = max(x-1, 0); int x_right = min(x+1, N-1);
    float temp = b( d_in[x_left] + d_in[x] + d_in[x_right] );
    shmem[0] = temp;
  }
  { // right halo: b(A(in)) for the element just after this block
    const int x = min((int)((blockIdx.x + 1) * blockDim.x), N-1);
    int x_left = max(x-1, 0); int x_right = min(x+1, N-1);
    float temp = b( d_in[x_left] + d_in[x] + d_in[x_right] );
    shmem[blockDim.x+1] = temp;
  }
}
__syncthreads();
// perform function C()
...
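
Pieced together with the kernel from the question, a complete version of this variant could look as follows. This is a sketch under the assumption that the kernel is launched with BLOCK_SIZE threads per block; the question's early return is replaced by clamping, because returning before __syncthreads() is not safe when the last block is only partially active:

__global__ void CbA(float *d_in, float *d_out)
{
    const int x  = threadIdx.x + blockIdx.x * blockDim.x;
    // clamp instead of returning early, so every thread reaches __syncthreads()
    const int xc = min(x, N-1);

    // shared memory for convolution (stencil size of 3): one halo element per side
    __shared__ float shmem[BLOCK_SIZE+2];

    // centre part: b(A(in)) for this thread's element
    int x_left  = max(xc-1, 0); int x_right = min(xc+1, N-1);
    shmem[threadIdx.x+1] = b( d_in[x_left] + d_in[xc] + d_in[x_right] );

    // boundary parts: thread 0 computes b(A(in)) for both halo elements
    if (threadIdx.x == 0)
    {
        const int xl = max((int)(blockIdx.x * blockDim.x) - 1, 0);
        shmem[0] = b( d_in[max(xl-1,0)] + d_in[xl] + d_in[min(xl+1,N-1)] );

        const int xr = min((int)((blockIdx.x + 1) * blockDim.x), N-1);
        shmem[blockDim.x+1] = b( d_in[max(xr-1,0)] + d_in[xr] + d_in[min(xr+1,N-1)] );
    }
    __syncthreads();

    // perform function C()
    if (x < N)
        d_out[x] = shmem[threadIdx.x] + shmem[threadIdx.x+1] + shmem[threadIdx.x+2];
}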

Avoid shared memory usage

(At least in your simplified example) the value of temp is the result of a very simple calculation. Maybe it is best to calculate all the values a thread needs to perform C() locally in that thread.

__global__ void CbA(float *d_in, float *d_out)
{
  const int x = threadIdx.x + blockIdx.x * blockDim.x;
  if (x >= N)  return; // safe here: this version uses no __syncthreads()

  // b(A(in)) at x-1, x and x+1, with all indices clamped (replicate boundary)
  float temp[3];
  for (int i = 0; i < 3; ++i)
  {
    int x_mid   = min(max(x-1+i, 0), N-1);
    int x_left  = max(x_mid-1, 0); int x_right = min(x_mid+1, N-1);
    temp[i] = b( d_in[x_left] + d_in[x_mid] + d_in[x_right] );
  }

  // perform function C()
  d_out[x] = temp[0] + temp[1] + temp[2];
}
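
For this variant (and the shared-memory one above) the original launch configuration carries over unchanged, and myfunc collapses to a single launch with no d_temp. A minimal sketch, assuming d_in and d_out are distinct device buffers; running the fused kernel in place is not safe, since each thread re-reads neighbouring input elements whose outputs other threads may already have written:

void myfunc(float *d_in, float *d_out){
    dim3 threads(256);
    dim3 blocks( (N + threads.x - 1) / threads.x ); // divide up

    // one fused kernel instead of bA() followed by C()
    CbA<<<blocks, threads>>>(d_in, d_out);
}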
