[英]Shared memory matrix multiplication kernel
I am attempting to implement a shared memory based matrix multiplication kernel as outlined in the CUDA C Programming Guide. 我正在尝试实现《 CUDA C编程指南》中概述的基于共享内存的矩阵乘法内核。 The following is the kernel: 以下是内核:
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
int ARows, int AColumns,
int BRows, int BColumns,
int CRows, int CColumns) {
float * CSub = &C[CColumns * 16 * blockIdx.y + 16 * blockIdx.x];
float CValue = 0;
for (int k = 0; k < (AColumns / 16); ++k) {
float * ASub = &A[AColumns * 16 * blockIdx.y + 16 * k];
float * BSub = &B[AColumns*16*k + 16*blockIdx.y];
__shared__ float As[16][16];
__shared__ float Bs[16][16];
As[threadIdx.y][threadIdx.x] = ASub[threadIdx.y*AColumns+threadIdx.x];
Bs[threadIdx.y][threadIdx.x] = BSub[threadIdx.y*AColumns+threadIdx.x];
__syncthreads();
for (int n = 0; n < 16; ++n)
CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];
__syncthreads();
}
CSub[threadIdx.x*CColumns+threadIdx.y]=CValue;
}
While the following is the call to the kernel: 以下是对内核的调用:
dim3 dimBlock(16, 16, 1);
dim3 dimGrid;
dimGrid.x = (CColumns + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
matrixMultiplyShared<<<dimGrid , dimBlock>>>(deviceA , deviceB , deviceC , ARows , AColumns, BRows ,BColumns , CRows , CColumns);
Unfortunately this seems to produce incorrect results. 不幸的是,这似乎产生了错误的结果。
Any assistance/explanations would be greatly appreciated. 任何帮助/解释将不胜感激。
There are at least 2 basic errors in your kernel, both relatively trivial. 您的内核中至少有2个基本错误,两者都很琐碎。 Where you have this: 您在哪里:
float * BSub = &B[AColumns*16*k + 16*blockIdx.y];
You should use this: 您应该使用此:
float * BSub = &B[AColumns*16*k + 16*blockIdx.x];
And where you have this: 而你有这个:
CSub[threadIdx.x*CColumns+threadIdx.y]=CValue;
You should use this: 您应该使用此:
CSub[threadIdx.y*CColumns+threadIdx.x]=CValue;
This should allow you to get basic correctness under the following conditions: 在以下情况下,这应该可以使您获得基本的正确性:
Fixing the square matrix limitation is not difficult. 固定方阵限制并不困难。 Fixing the dimension limitation on the tile dimension involves considerable changes to the kernel, in order to: 在图块尺寸上固定尺寸限制涉及对内核的重大更改,以便:
Since your code doesn't comprehend any of this, I wasn't sure if you're asking about it and chose not to address those issues specifically. 由于您的代码不了解任何这些内容,因此我不确定您是否要询问它,并选择不专门解决这些问题。
I was able to get the following adaptation of your code working as a basic example: (note that for the benefit of reduced code size to look at, I have dispensed with usual CUDA error checking . Please don't use this as a representative example of good coding. Do proper error checking. The point of my answer is not to explain good CUDA error checking but to show an algorithmically correct example.) 我可以对您的代码进行以下修改,作为一个基本示例:(请注意,为了减少代码量,我省去了通常的CUDA错误检查 。请不要将其用作代表示例正确的错误检查。我的回答不是说明良好的CUDA错误检查,而是显示算法上正确的示例。)
#include <stdio.h>
#include <math.h>
#define TILE_DIM 16
#define DIMX 256
#define DIMY 256
#define RES 0.1
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
int ARows, int AColumns,
int BRows, int BColumns,
int CRows, int CColumns) {
float CValue = 0;
if (((blockIdx.y * blockDim.y + threadIdx.y)< CRows) && ((blockIdx.x * blockDim.x + threadIdx.x) < CColumns)) {
for (int k = 0; k < (AColumns / TILE_DIM); ++k) {
float * ASub = &A[AColumns * TILE_DIM * blockIdx.y + TILE_DIM * k];
float * BSub = &B[AColumns*TILE_DIM*k + TILE_DIM*blockIdx.x];
__shared__ float As[TILE_DIM][TILE_DIM];
__shared__ float Bs[TILE_DIM][TILE_DIM];
As[threadIdx.y][threadIdx.x] = ASub[threadIdx.y*AColumns+threadIdx.x];
Bs[threadIdx.y][threadIdx.x] = BSub[threadIdx.y*AColumns+threadIdx.x];
__syncthreads();
for (int n = 0; n < TILE_DIM; ++n)
CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];
__syncthreads();
}
C[((blockIdx.y * blockDim.y + threadIdx.y)*CColumns)+(blockIdx.x*blockDim.x)+threadIdx.x]=CValue;
}
}
void matrixMultiplyCPU(float * A, float * B, float * C,
int ARows, int AColumns,
int BRows, int BColumns,
int CRows, int CColumns) {
for (int i = 0; i<ARows; i++)
for (int j=0; j<BColumns; j++){
float Ctemp = 0.0;
for (int k=0; k<AColumns; k++)
Ctemp += A[i*AColumns + k] * B[k*BColumns+j];
C[i*CColumns+j] = Ctemp;
}
}
int main(){
int CColumns = DIMY, CRows=DIMX, AColumns=DIMY, ARows=DIMX, BColumns=DIMY, BRows=DIMX;
dim3 dimBlock(TILE_DIM, TILE_DIM, 1);
dim3 dimGrid;
dimGrid.x = (CColumns + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
float *deviceA, *deviceB, *deviceC;
float hostA[DIMY][DIMX];
float hostB[DIMY][DIMX];
float hostC[DIMY][DIMX];
float hostCp[DIMY][DIMX];
for (int x = 0; x<DIMX; x++)
for (int y = 0; y<DIMY; y++) {
hostA[y][x] = rand()/(float)RAND_MAX;
hostB[y][x] = rand()/(float)RAND_MAX;
}
cudaMalloc((void **)&deviceA, DIMX*DIMY*sizeof(float));
cudaMalloc((void **)&deviceB, DIMX*DIMY*sizeof(float));
cudaMalloc((void **)&deviceC, DIMX*DIMY*sizeof(float));
cudaMemcpy(deviceA, hostA, DIMX*DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, DIMX*DIMY*sizeof(float), cudaMemcpyHostToDevice);
matrixMultiplyShared<<<dimGrid , dimBlock>>>(deviceA , deviceB , deviceC , ARows , AColumns, BRows ,BColumns , CRows , CColumns);
cudaMemcpy(hostC, deviceC, DIMX*DIMY*sizeof(float), cudaMemcpyDeviceToHost);
matrixMultiplyCPU(&(hostA[0][0]) , &(hostB[0][0]) , &(hostCp[0][0]) , ARows , AColumns, BRows ,BColumns , CRows , CColumns);
for (int y = 0; y<DIMY; y++)
for (int x = 0; x<DIMX; x++)
if (fabs(hostCp[y][x] - hostC[y][x]) > RES)
{
printf("Error at offset y=%d,x=%d, CPU = %f, GPU = %f\n", y, x, hostCp[y][x], hostC[y][x]);
return 1;
}
printf("Finished!\n");
return 0;
}
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.