
Shared memory matrix multiplication kernel

I am attempting to implement a shared memory based matrix multiplication kernel as outlined in the CUDA C Programming Guide. The following is the kernel:

 __global__ void matrixMultiplyShared(float * A, float * B, float * C,
                                      int ARows, int AColumns,
                                      int BRows, int BColumns,
                                      int CRows, int CColumns) {
     float * CSub = &C[CColumns * 16 * blockIdx.y + 16 * blockIdx.x];
     float CValue = 0;
     for (int k = 0; k < (AColumns / 16); ++k) {
         float * ASub = &A[AColumns * 16 * blockIdx.y + 16 * k];
         float * BSub = &B[AColumns*16*k + 16*blockIdx.y];
         __shared__ float As[16][16];
         __shared__ float Bs[16][16];
         As[threadIdx.y][threadIdx.x] = ASub[threadIdx.y*AColumns+threadIdx.x];
         Bs[threadIdx.y][threadIdx.x] = BSub[threadIdx.y*AColumns+threadIdx.x];
         __syncthreads();
         for (int n = 0; n < 16; ++n)
             CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];
         __syncthreads();
     }
     CSub[threadIdx.x*CColumns+threadIdx.y]=CValue;
 }

While the following is the call to the kernel:

 dim3 dimBlock(16, 16, 1);
 dim3 dimGrid;
 dimGrid.x = (CColumns + dimBlock.x - 1)/dimBlock.x;
 dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
 matrixMultiplyShared<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC, ARows, AColumns, BRows, BColumns, CRows, CColumns);

Unfortunately this seems to produce incorrect results.

Any assistance/explanations would be greatly appreciated.

There are at least 2 basic errors in your kernel, both relatively trivial. Where you have this:

     float * BSub = &B[AColumns*16*k + 16*blockIdx.y];

You should use this:

     float * BSub = &B[AColumns*16*k + 16*blockIdx.x];
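A side note on that index: the tile of B consumed at step k begins 16*k rows down and 16*blockIdx.x columns across, so its row-major base offset is (16*k)*stride + 16*blockIdx.x, where the stride is B's row length. Both this line and the shared memory load for Bs use AColumns as that stride, which only matches when the matrices are square; that is one reason for condition 1 below. A sketch of the general (non-square) form:

     // use B's own row stride, BColumns, rather than AColumns:
     float * BSub = &B[BColumns*16*k + 16*blockIdx.x];
     // the corresponding shared memory load then uses the same stride:
     // Bs[threadIdx.y][threadIdx.x] = BSub[threadIdx.y*BColumns+threadIdx.x];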

And where you have this:

     CSub[threadIdx.x*CColumns+threadIdx.y]=CValue;

You should use this:

     CSub[threadIdx.y*CColumns+threadIdx.x]=CValue;
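The reason is C's row-major layout: the thread at (threadIdx.y, threadIdx.x) owns the element in row blockIdx.y*16 + threadIdx.y and column blockIdx.x*16 + threadIdx.x of C, so measured from the tile base pointer CSub the offset works out as:

     // the row index scales by the row stride CColumns; the column index does not
     // CSub[threadIdx.y*CColumns + threadIdx.x]

Swapping the two indices, as the original code does, transposes each 16x16 tile of the output in place.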

This should allow you to get basic correctness under the following conditions:

  1. square matrices
  2. matrix dimensions evenly divisible by tile dimension

Fixing the square matrix limitation is not difficult. Fixing the requirement that the matrix dimensions be evenly divisible by the tile dimension involves considerable changes to the kernel, in order to:

  1. not process out-of-range elements
  2. properly populate your shared memory area with values that are appropriate in the "border" regions

Since your code doesn't comprehend any of this, I wasn't sure if you're asking about it and chose not to address those issues specifically.
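For reference, the usual pattern is to round the number of tile steps up, zero-fill the out-of-range shared memory entries, and guard the final store. Below is a minimal sketch under those rules (the kernel name is my own, and it is untested here; it reuses the TILE_DIM macro from the example further down and uses B's own row stride, so it also lifts the square-matrix limitation). Note that the guards go around the loads and the store rather than around the whole loop, because every thread in the block must reach the __syncthreads() calls:

 __global__ void matrixMultiplySharedBorders(float * A, float * B, float * C,
                      int ARows, int AColumns,
                      int BRows, int BColumns,
                      int CRows, int CColumns) {
     __shared__ float As[TILE_DIM][TILE_DIM];
     __shared__ float Bs[TILE_DIM][TILE_DIM];
     int Row = blockIdx.y*TILE_DIM + threadIdx.y;
     int Col = blockIdx.x*TILE_DIM + threadIdx.x;
     float CValue = 0;
     // round the tile count up so a partial last tile is still processed
     for (int k = 0; k < (AColumns + TILE_DIM - 1)/TILE_DIM; ++k) {
         // zero-fill where a tile hangs past the edge of A or B, so the
         // inner product below needs no per-element guards
         if (Row < ARows && k*TILE_DIM + threadIdx.x < AColumns)
             As[threadIdx.y][threadIdx.x] = A[Row*AColumns + k*TILE_DIM + threadIdx.x];
         else
             As[threadIdx.y][threadIdx.x] = 0.0f;
         if (k*TILE_DIM + threadIdx.y < BRows && Col < BColumns)
             Bs[threadIdx.y][threadIdx.x] = B[(k*TILE_DIM + threadIdx.y)*BColumns + Col];
         else
             Bs[threadIdx.y][threadIdx.x] = 0.0f;
         __syncthreads();
         for (int n = 0; n < TILE_DIM; ++n)
             CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];
         __syncthreads();
     }
     // threads whose (Row, Col) falls outside C must not write
     if (Row < CRows && Col < CColumns)
         C[Row*CColumns + Col] = CValue;
 }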

I was able to get the following adaptation of your code working as a basic example. (Note that to keep the code size down, I have dispensed with the usual CUDA error checking. Please don't use this as a representative example of good coding; do proper error checking. The point of my answer is not to explain good CUDA error checking, but to show an algorithmically correct example.)

#include <stdio.h>
#include <math.h>
#define TILE_DIM 16
#define DIMX 256
#define DIMY 256
#define RES 0.1

__global__ void matrixMultiplyShared(float * A, float * B, float * C,
                     int ARows, int AColumns,
                     int BRows, int BColumns,
                     int CRows, int CColumns) {
     float CValue = 0;
     if (((blockIdx.y * blockDim.y + threadIdx.y)< CRows) && ((blockIdx.x * blockDim.x + threadIdx.x) < CColumns)) {
       for (int k = 0; k < (AColumns / TILE_DIM); ++k) {
         float * ASub =  &A[AColumns * TILE_DIM * blockIdx.y + TILE_DIM * k];
         float * BSub = &B[AColumns*TILE_DIM*k + TILE_DIM*blockIdx.x];
         __shared__ float As[TILE_DIM][TILE_DIM];
         __shared__ float Bs[TILE_DIM][TILE_DIM];
         As[threadIdx.y][threadIdx.x] = ASub[threadIdx.y*AColumns+threadIdx.x];
         Bs[threadIdx.y][threadIdx.x] = BSub[threadIdx.y*AColumns+threadIdx.x];
         __syncthreads();
         for (int n = 0; n < TILE_DIM; ++n)
             CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];
         __syncthreads();
       }
       C[((blockIdx.y * blockDim.y + threadIdx.y)*CColumns)+(blockIdx.x*blockDim.x)+threadIdx.x]=CValue;
     }
 }


void matrixMultiplyCPU(float * A, float * B, float * C,
                     int ARows, int AColumns,
                     int BRows, int BColumns,
                     int CRows, int CColumns) {
  for (int i = 0; i<ARows; i++)
    for (int j=0; j<BColumns; j++){
      float Ctemp = 0.0;
      for (int k=0; k<AColumns; k++)
        Ctemp += A[i*AColumns + k] * B[k*BColumns+j];
      C[i*CColumns+j] = Ctemp;
    }
}
int main(){
 int CColumns = DIMY, CRows=DIMX, AColumns=DIMY, ARows=DIMX, BColumns=DIMY, BRows=DIMX;
 dim3 dimBlock(TILE_DIM, TILE_DIM, 1);
 dim3 dimGrid;
 dimGrid.x = (CColumns + dimBlock.x - 1)/dimBlock.x;
 dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
 float *deviceA, *deviceB, *deviceC;
 float hostA[DIMY][DIMX];
 float hostB[DIMY][DIMX];
 float hostC[DIMY][DIMX];
 float hostCp[DIMY][DIMX];
 for (int x = 0; x<DIMX; x++)
   for (int y = 0; y<DIMY; y++) {
     hostA[y][x] = rand()/(float)RAND_MAX;
     hostB[y][x] = rand()/(float)RAND_MAX;
     }
  cudaMalloc((void **)&deviceA, DIMX*DIMY*sizeof(float));
  cudaMalloc((void **)&deviceB, DIMX*DIMY*sizeof(float));
  cudaMalloc((void **)&deviceC, DIMX*DIMY*sizeof(float));
  cudaMemcpy(deviceA, hostA, DIMX*DIMY*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(deviceB, hostB, DIMX*DIMY*sizeof(float), cudaMemcpyHostToDevice);
  matrixMultiplyShared<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC, ARows, AColumns, BRows, BColumns, CRows, CColumns);
  cudaMemcpy(hostC, deviceC, DIMX*DIMY*sizeof(float), cudaMemcpyDeviceToHost);
  matrixMultiplyCPU(&(hostA[0][0]) , &(hostB[0][0]) , &(hostCp[0][0]) , ARows , AColumns, BRows ,BColumns , CRows , CColumns);

 for (int y = 0; y<DIMY; y++)
   for (int x = 0; x<DIMX; x++)
     if (fabs(hostCp[y][x] - hostC[y][x]) > RES)
       {
       printf("Error at offset y=%d,x=%d, CPU = %f, GPU = %f\n", y, x, hostCp[y][x], hostC[y][x]);
       return 1;
       }
 printf("Finished!\n");
 return 0;
}
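Since the example deliberately omits error checking, here is a minimal sketch of the usual pattern (the cudaCheck macro name is my own, not part of the CUDA API): wrap every runtime call, then check the kernel launch and its execution separately. The example itself builds with nvcc as-is, e.g. nvcc -o matmul matmul.cu.

 #include <stdio.h>
 #include <stdlib.h>

 #define cudaCheck(call)                                        \
   do {                                                         \
     cudaError_t err = (call);                                  \
     if (err != cudaSuccess) {                                  \
       fprintf(stderr, "CUDA error %s at %s:%d\n",              \
               cudaGetErrorString(err), __FILE__, __LINE__);    \
       exit(1);                                                 \
     }                                                          \
   } while (0)

 // applied to the calls in main() above:
 //   cudaCheck(cudaMalloc((void **)&deviceA, DIMX*DIMY*sizeof(float)));
 //   matrixMultiplyShared<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
 //                        ARows, AColumns, BRows, BColumns, CRows, CColumns);
 //   cudaCheck(cudaGetLastError());        // catches launch-configuration errors
 //   cudaCheck(cudaDeviceSynchronize());   // catches errors during kernel execution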
