CUDA矩阵乘法大小

Question

i am new to cuda c..i wrote a basic matrix multiplication programme using shared memory..but the problem is i cannot increase the matrix size beyond 288 and if i does so i get stack overflow error..i have nvidia gtx 480 gpu..could anyone pls tell me how to increase the size and what mistakes i'm doing 我是cuda c。的新手..我使用共享内存编写了一个基本的矩阵乘法程序..但问题是我无法将矩阵大小增加到288以上，如果这样做，我会得到堆栈溢出错误..我有nvidia gtx 480 gpu ..有人可以告诉我如何增加尺寸以及我在做什么错误

#define tile_width 16
#define width 288
void mat_mul_kernel1(int *a,int *b,int *c)
{
    int row= blockIdx.y*blockDim.y + threadIdx.y;
    int col= blockIdx.x*blockDim.x + threadIdx.x;
    int pvalue=0;
    __shared__ int sha[tile_width*tile_width];
    __shared__ int shb[tile_width*tile_width];

    for (int m=0;m<width/tile_width;m++)
    {
        sha[threadIdx.y*tile_width+threadIdx.x]=a[row*width+(m*tile_width)+threadIdx.x];
        shb[threadIdx.y*tile_width+threadIdx.x]=b[(m*tile_width+threadIdx.y)*width+col];
        __syncthreads();
        for (int k=0;k<tile_width;k++)
            pvalue+=sha[threadIdx.y*tile_width+k]*shb[k*tile_width+threadIdx.x];
        __syncthreads();
    }

    c[row*width+col]=pvalue;
}
int main()
{
    int a[width*width],b[width*width],c[width*width];
    int *deva,*devb,*devc;
    float etime;
    for (int i=0;i<width;i++)
    {
        for(int j=0;j<width;j++)
        {
            a[i*width+j]=1;
            b[i*width+j]=1;
        }
    }
    cudaEvent_t start,stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    dim3 dimGrid((int)(width)/tile_width,(int)(width)/tile_width);
    dim3 dimBlock(tile_width,tile_width);
    cudaError_t error;
    error=cudaMalloc((void**)&deva,width*width*sizeof(int));
    if(error!= cudaSuccess)
    {
        printf("error at a allocation");
        exit(EXIT_FAILURE);
    }
    error=cudaMemcpy(deva,a,width*width*sizeof(int),cudaMemcpyHostToDevice);
    if(error!= cudaSuccess)
    {
        printf("error at a copying");
        exit(EXIT_FAILURE);
    }
    error=cudaMalloc((void**)&devb,width*width*sizeof(int));
    if(error!= cudaSuccess)
    {
        printf("error at b allocation");
        exit(EXIT_FAILURE);
    }
    error=cudaMemcpy(devb,b,width*width*sizeof(int),cudaMemcpyHostToDevice);
    if(error!= cudaSuccess)
    {
        printf("error at b copying");
        exit(EXIT_FAILURE);
    }
    error=cudaMalloc((void**)&devc,width*width*sizeof(int));
    if(error!= cudaSuccess)
    {
        printf("error at c allocation");
        exit(EXIT_FAILURE);
    }
    cudaEventRecord(start,0);
    mat_mul_kernel1<<<dimGrid,dimBlock,tile_width*tile_width*sizeof(int)>>>(deva,devb,devc);
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&etime,start,stop);
    error=cudaMemcpy(c,devc,width*width*sizeof(int),cudaMemcpyDeviceToHost);
    if(error!= cudaSuccess)
    {
        printf("error at c copying");
        //exit(EXIT_FAILURE);
    }
    cudaFree(deva);
    cudaFree(devb);
    cudaFree(devc);
    printf("ElapsedTime %f milliseconds",etime);
}

Answer 1

The problem you see has nothing to do with CUDA. 您看到的问题与CUDA无关。 The problems are your arrays a, b, c. 问题是您的数组a，b，c。 They are allocated on the stack. 它们被分配在堆栈上。 They have a size of 288 x 288 x siezof(int) x 3 what leads to 972kB ( sizeof(int) = 4 byte ). 它们的大小为288 x 288 x siezof(int) x 3这导致972kB（ sizeof(int) = 4 byte ）。 So I asume your hitting the standard maximum stack size, which lies, as far as I know, arround 1MB. 因此，我假设您达到了标准的最大堆栈大小，据我所知，最大堆栈大小约为1MB。

Try to allocate your arrays dynamically on the heap 尝试在堆上动态分配数组

int* a = (int*) malloc(width * width * sizeof(int));

and free the memory at the end 并在最后释放内存

free(a);

CUDA矩阵乘法大小

问题描述

1 个解决方案

解决方案1
4 已采纳 2013-09-06 05:32:26

CUDA矩阵乘法大小

问题描述

1 个解决方案

解决方案1 4 已采纳 2013-09-06 05:32:26

解决方案1
4 已采纳 2013-09-06 05:32:26