[英]Cannot run CUDA kernel : too many resources requested for launch
Take a look on my self written cuda kernel. 看一下我自己编写的cuda内核。 I had a big kernel but it returned me error message. 我有一个大内核,但它返回了错误消息。 Then I simplified it and found that it fails on one loop. 然后我简化了它,发现它在一个循环中失败了。 I simplified this loop and found that if I use int value or constant value to fill data[threadIdx.x] in loop it works fine. 我简化了此循环,发现如果我使用int值或常量值在循环中填充data [threadIdx.x],则可以正常工作。 But if I use double type value it returns an error. 但是,如果我使用double类型的值,它将返回错误。
Advice : if you are not correctly coping your data from host to device, you can get "warning: Cuda API error detected: cudaLaunch returned (0x7)" message when you use Nsight or you can get segmentation fault error when you run your app from terminal 建议 :如果您没有正确地将数据从主机复制到设备,则在使用Nsight时会收到“警告:检测到Cuda API错误:cudaLaunch返回(0x7)”消息,或者从以下位置运行应用程序时可能会出现分段错误错误终奌站
__global__ void sumSeries(double* dSum,int* totalThreadNumber){
volatile __shared__ double data[768];
double var=0;
data[threadIdx.x]=0;
for ( int i = 10 ; i < 20 ;++i){
var=i;
data[threadIdx.x] += (var)/(var*var+1);
__syncthreads();
}
}
Why it does not work? 为什么不起作用?
int main() {
int threadsPerBlock=768;
int blockCount=8;
int *hostThreadNumber=new int ;
*hostThreadNumber=threadsPerBlock*blockCount;
int* deviceThreadNumber=NULL;
double* deviceSum=NULL;
double* hostSum=(double*)malloc(blockCount);
cudaError_t cuerr=cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
if (cuerr != cudaSuccess){
std::cout<<"Cant SetCacheConfig: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr=cudaMalloc(&deviceSum,blockCount*sizeof(double));//размер дабла*число блоков
if (cuerr != cudaSuccess){
std::cout<<"Cant allocate memory for deviceSum: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr=cudaMalloc(&deviceThreadNumber,sizeof(int));
if (cuerr != cudaSuccess){
std::cout<<"Cant allocate memory for deviceThreadNumber: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr = cudaMemcpy(deviceSum,hostSum,blockCount*sizeof(double),cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy hostSum to device: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr = cudaMemcpy(deviceThreadNumber,hostThreadNumber,sizeof(int),cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy hostThreadNumber to device: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
sumSeries<<<dim3(blockCount),dim3(threadsPerBlock)>>>(deviceSum,deviceThreadNumber);
cuerr=cudaGetLastError();
if (cuerr != cudaSuccess){
std::cout<<"Cuda kernel error: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr= cudaDeviceSynchronize();
if (cuerr != cudaSuccess){
std::cout<<"Can not synchronize cuda kernel : "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr= cudaMemcpy(hostSum,deviceSum,blockCount*sizeof(double),cudaMemcpyDeviceToHost);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy data to host: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cudaFree(deviceSum);
cudaFree(deviceThreadNumber);
return 0;
}
You just allocated 8 bytes memory for hostSum
您刚刚为hostSum
分配了8个字节的内存
double* hostSum=(double*)malloc(blockCount)
That's wrong if I assumed that you want to allocate blockCount * sizeof(double)
bytes for it, because you allocate this amount of memory for deviceSum
and uses it for the memory copy between host and device. 如果我假设您想为其分配blockCount * sizeof(double)
个字节,那是错误的,因为您为deviceSum
分配了此内存量,并将其用于主机和设备之间的内存副本。
cuerr = cudaMalloc(&deviceSum,blockCount*sizeof(double));
cuerr = cudaMemcpy(deviceSum,hostSum,blockCount*sizeof(double),cudaMemcpyHostToDevice);
cuerr= cudaMemcpy(hostSum,deviceSum,blockCount*sizeof(double),cudaMemcpyDeviceToHost);
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.