简体   繁体   English

无法运行CUDA内核:请求启动的资源过多

[英]Cannot run CUDA kernel : too many resources requested for launch

Take a look at my self-written CUDA kernel. 看一下我自己编写的cuda内核。 I had a big kernel, but it returned an error message. 我有一个大内核,但它返回了错误消息。 Then I simplified it and found that it fails in one loop. 然后我简化了它,发现它在一个循环中失败了。 I simplified this loop and found that if I use an int value or a constant value to fill data[threadIdx.x] in the loop, it works fine. 我简化了此循环,发现如果我使用int值或常量值在循环中填充data [threadIdx.x],则可以正常工作。 But if I use a value of type double, it returns an error. 但是,如果我使用double类型的值,它将返回错误。

Advice: if you are not correctly copying your data from host to device, you can get a "warning: Cuda API error detected: cudaLaunch returned (0x7)" message when you use Nsight, or a segmentation fault when you run your app from a terminal. 建议:如果您没有正确地将数据从主机复制到设备,则在使用Nsight时会收到“警告:检测到Cuda API错误:cudaLaunch返回(0x7)”消息,或者从终端运行应用程序时可能会出现分段错误。

// Accumulates, per thread, the sum of i / (i*i + 1) for i = 10..19 into that
// thread's own slot of a block-shared scratch array.
//
// Launch layout: the scratch array has 768 slots indexed by threadIdx.x, so
// blockDim.x must not exceed 768. Static shared memory: 768 * sizeof(double).
//
// dSum and totalThreadNumber are accepted but never read or written here;
// nothing is stored to global memory.
__global__ void sumSeries(double* dSum,int* totalThreadNumber){
    volatile  __shared__ double data[768];
    const unsigned int slot = threadIdx.x;

    data[slot] = 0;

    int i = 10;
    while (i < 20) {
        const double term = i;
        // volatile forces a real shared-memory load and store on every pass.
        data[slot] = data[slot] + term / (term * term + 1);
        // Reached unconditionally by every thread of the block on each pass.
        __syncthreads();
        ++i;
    }
}

Why does it not work? 为什么不起作用?

// Host driver: allocates one double per block plus a single int on the device,
// uploads the host copies, launches sumSeries with 8 blocks of 768 threads,
// waits for completion and copies the per-block buffer back.
//
// Returns 0 on success and -1 on any allocation, copy, launch or
// synchronisation failure (the failing step is reported on std::cout).
// All host and device buffers are released on every exit path.
int main() {

    const int threadsPerBlock=768;
    const int blockCount=8;
    // Size in BYTES of the per-block result buffer; used for the host
    // allocation, the device allocation and both copies so they cannot disagree.
    const size_t sumBytes=blockCount*sizeof(double);

    // A plain stack variable is enough for the single int that gets uploaded.
    int hostThreadNumber=threadsPerBlock*blockCount;
    int* deviceThreadNumber=NULL;

    double* deviceSum=NULL;
    // calloc takes (count, element size) and zero-fills, so the upload below
    // never reads uninitialised memory and the buffer really holds
    // blockCount doubles (the previous malloc(blockCount) was only 8 bytes).
    double* hostSum=(double*)calloc(blockCount,sizeof(double));
    if (hostSum == NULL){
        std::cout<<"Cant allocate memory for hostSum"<<std::endl;
        return -1;
    }

    int exitCode=-1;
    cudaError_t cuerr=cudaSuccess;

    // Single-pass loop: `break` jumps to the shared cleanup code below.
    do {
        cuerr=cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
        if (cuerr != cudaSuccess){
            std::cout<<"Cant SetCacheConfig: "<<cudaGetErrorString(cuerr)<<std::endl;
            break;
        }

        cuerr=cudaMalloc(&deviceSum,sumBytes);// sizeof(double) * number of blocks
        if (cuerr != cudaSuccess){
            std::cout<<"Cant allocate memory for deviceSum: "<<cudaGetErrorString(cuerr)<<std::endl;
            break;
        }
        cuerr=cudaMalloc(&deviceThreadNumber,sizeof(int));
        if (cuerr != cudaSuccess){
            std::cout<<"Cant allocate memory for deviceThreadNumber: "<<cudaGetErrorString(cuerr)<<std::endl;
            break;
        }

        cuerr = cudaMemcpy(deviceSum,hostSum,sumBytes,cudaMemcpyHostToDevice);
        if (cuerr != cudaSuccess){
            std::cout<<"Can not copy hostSum to device: "<<cudaGetErrorString(cuerr)<<std::endl;
            break;
        }
        cuerr = cudaMemcpy(deviceThreadNumber,&hostThreadNumber,sizeof(int),cudaMemcpyHostToDevice);
        if (cuerr != cudaSuccess){
            std::cout<<"Can not copy hostThreadNumber to device: "<<cudaGetErrorString(cuerr)<<std::endl;
            break;
        }

        sumSeries<<<dim3(blockCount),dim3(threadsPerBlock)>>>(deviceSum,deviceThreadNumber);
        // Launch-configuration errors are only visible through cudaGetLastError().
        cuerr=cudaGetLastError();
        if (cuerr != cudaSuccess){
            std::cout<<"Cuda kernel error: "<<cudaGetErrorString(cuerr)<<std::endl;
            break;
        }
        // Errors raised while the kernel executes surface at the next sync.
        cuerr= cudaDeviceSynchronize();
        if (cuerr != cudaSuccess){
            std::cout<<"Can not synchronize cuda kernel : "<<cudaGetErrorString(cuerr)<<std::endl;
            break;
        }
        cuerr= cudaMemcpy(hostSum,deviceSum,sumBytes,cudaMemcpyDeviceToHost);
        if (cuerr != cudaSuccess){
            std::cout<<"Can not copy data to host: "<<cudaGetErrorString(cuerr)<<std::endl;
            break;
        }

        exitCode=0;
    } while (false);

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(deviceSum);
    cudaFree(deviceThreadNumber);
    free(hostSum);
    return exitCode;
}

You allocated only 8 bytes of memory for hostSum: 您只为hostSum分配了8个字节的内存:

double* hostSum=(double*)malloc(blockCount)

That is wrong, assuming you want to allocate blockCount * sizeof(double) bytes for it, because you allocate that amount of memory for deviceSum and use that size for the memory copies between host and device. 如果我假设您想为其分配blockCount * sizeof(double)个字节,那是错误的,因为您为deviceSum分配了此内存量,并将其用于主机和设备之间的内存副本。

cuerr = cudaMalloc(&deviceSum,blockCount*sizeof(double));

cuerr = cudaMemcpy(deviceSum,hostSum,blockCount*sizeof(double),cudaMemcpyHostToDevice);

cuerr= cudaMemcpy(hostSum,deviceSum,blockCount*sizeof(double),cudaMemcpyDeviceToHost);

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM