[英]CUDA memory management / pointers in classes problems
我正在努力解決一些內存管理問題。 將結果復制到主機時,我一直收到“未指定的啟動失敗”(unspecified launch failure)錯誤消息。
我的代碼非常簡單-它在每個線程中生成兩個uint並將它們相乘。 我有一個提供隨機數的類:
// Helper that holds a per-thread array of cuRAND generator states and exposes a
// device-side accessor for drawing random unsigned integers from it.
// NOTE(review): the ':' at the end of the next line is not followed by any base class;
// a base-class name may have been lost when the snippet was pasted — confirm.
class CuRandCuRandomNumberProvider :
{
public:
// Allocates the state array and seeds it from the current time (see the definition below).
CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock);
// Overload taking an explicit seed; its definition is not shown in this snippet.
CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock, unsigned int seed);
// Device-only: returns a random unsigned int drawn from the calling thread's state.
__device__ unsigned int GetRandomNumber();
// Declared here; no definition is visible in this snippet.
~CuRandCuRandomNumberProvider();
protected:
// Filled in by cudaMalloc in the constructor, so it holds a device-memory address.
curandState * states;
// Device-only helper; declared but not defined in this snippet.
__device__ bool IsPrime(unsigned int number);
};
/**
 * Allocates one curandState per launched thread and seeds the array on the device.
 *
 * Runs on the host. `states` receives a device address from cudaMalloc; the object
 * itself lives wherever the caller constructs it.
 *
 * @param numBlocks        grid dimensions of the seeding launch (only x and y are counted)
 * @param threadsPerBlock  block dimensions of the seeding launch (only x and y are counted)
 */
CuRandCuRandomNumberProvider::CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock)
{
    // Only x and y contribute, matching the 2D offset computed inside setup_kernel.
    int numberOfThreads = threadsPerBlock.x * threadsPerBlock.y * numBlocks.x * numBlocks.y;
    std::cout << numberOfThreads << std::endl;

    // An unchecked allocation failure would only show up later as a fault inside a kernel.
    HANDLE_ERROR( cudaMalloc ( &this->states, numberOfThreads*sizeof( curandState ) ) );

    setup_kernel <<< numBlocks, threadsPerBlock >>> ( this->states, time(NULL) );
    // A launch returns no status itself; configuration errors are read back here.
    HANDLE_ERROR( cudaGetLastError() );
}
/**
 * Draws one random unsigned int for the calling thread.
 *
 * The thread's flat 2D index selects its entry in `states`; a uniform float is
 * drawn from that state and scaled by UINT_MAX.
 *
 * @return the scaled sample, converted to unsigned int
 */
__device__ unsigned int CuRandCuRandomNumberProvider::GetRandomNumber()
{
    // Global x/y position of this thread within the launch.
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row    = blockIdx.y * blockDim.y + threadIdx.y;

    // Row-major flattening; one row spans blockDim.x * gridDim.x threads.
    const int slot = row * blockDim.x * gridDim.x + column;

    // `states` is read through `this`, so the object must be readable from device code.
    const float sample = curand_uniform(&this->states[slot]);

    // Scale in double so UINT_MAX is represented exactly before the multiply.
    return static_cast<double>(UINT_MAX) * sample;
}
setup_kernel存儲在頭文件中,如下所示:
/**
 * Seeds one cuRAND generator state per thread of a 2D launch.
 *
 * Each thread derives a flat index from its x/y position and initialises
 * state[index] with the shared seed, passing that index as the sequence
 * number and 0 as the offset.
 *
 * @param state  array with one entry per launched thread (the kernel does no bounds check)
 * @param seed   seed handed unchanged to curand_init by every thread
 */
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row    = blockIdx.y * blockDim.y + threadIdx.y;

    // Number of threads in one row of the whole grid.
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int index = row * rowWidth + column;

    curand_init ( seed, index, 0, state + index );
}
我的主內核非常簡單,看起來像這樣:
/**
 * Writes one (a, b, a * b) triple per thread into ptr, drawing a and b from `provider`.
 *
 * @param ptr       output array with one uint3 per launched thread (no bounds check is done)
 * @param provider  random-number source; it is dereferenced by every thread, so the
 *                  address has to be one that device code can read
 */
__global__ void InitKernel(uint3 * ptr, CuRandCuRandomNumberProvider * provider)
{
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row    = blockIdx.y * blockDim.y + threadIdx.y;
    const int index  = row * blockDim.x * gridDim.x + column;

    // Work on this thread's own output element.
    uint3 & cell = ptr[index];
    cell.x = provider->GetRandomNumber();
    cell.y = provider->GetRandomNumber();
    cell.z = cell.x * cell.y;
}
main 中的執行代碼如下,其中最后一個 cudaMemcpy 引發了問題:
// Host-side driver for InitKernel: allocate the result buffer on host and device,
// build the provider, launch the kernel, then copy the results back.
uint3 * pqnD;
uint3 * pqnH = (uint3*)malloc(sizeof(uint3) * numberOfThreads );
memset(pqnH,0,sizeof(uint3) * numberOfThreads );
HANDLE_ERROR( cudaMalloc( (void**)&pqnD, sizeof(uint3) * numberOfThreads ));
// `new` places the provider object itself in host memory; only its `states` member
// (set by cudaMalloc in the constructor) refers to device memory.
CuRandCuRandomNumberProvider * provider = new CuRandCuRandomNumberProvider(numBlocks, threadsPerBlock);
// The kernel receives the host address held in `provider` and dereferences it on the device.
InitKernel<<<numBlocks, threadsPerBlock>>>(pqnD, provider);
// A fault raised inside a kernel is reported by the next synchronizing call, which is this copy.
HANDLE_ERROR( cudaMemcpy( pqnH, pqnD, sizeof(uint3) * numberOfThreads, cudaMemcpyDeviceToHost ) ); // this line causes error
HANDLE_ERROR( cudaFree( pqnD ) );
如果我全部按常規方式來做(不通過類),例如:
// Reference variant that bypasses the provider class: the curandState array is allocated
// and seeded directly, then handed to the kernel as a raw device pointer.
uint3 * pqnD;
uint3 * pqnH = (uint3*)malloc(sizeof(uint3) * numberOfThreads );
memset(pqnH,0,sizeof(uint3) * numberOfThreads );
HANDLE_ERROR( cudaMalloc( (void**)&pqnD, sizeof(uint3) * numberOfThreads ));
curandState * states;
// Checked like every other runtime call in this fragment.
HANDLE_ERROR( cudaMalloc ( &states, numberOfThreads*sizeof( curandState ) ) );
setup_kernel <<< numBlocks, threadsPerBlock >>> ( states, time(NULL) );
// A launch returns no status itself; configuration errors are read back here.
HANDLE_ERROR( cudaGetLastError() );
// NOTE(review): the only three-argument constructor declared takes an unsigned int seed,
// not a curandState*, and `provider` is not used by InitKernel2 — confirm this line is intended.
CuRandCuRandomNumberProvider * provider = new CuRandCuRandomNumberProvider(numBlocks, threadsPerBlock, states);
InitKernel2<<<numBlocks, threadsPerBlock>>>(pqnD, states);
HANDLE_ERROR( cudaGetLastError() );
// Blocking copy; a fault raised inside InitKernel2 would also be reported here.
HANDLE_ERROR( cudaMemcpy( pqnH, pqnD, sizeof(uint3) * numberOfThreads, cudaMemcpyDeviceToHost ) );
HANDLE_ERROR( cudaFree( pqnD ) );
setup_kernel完全相同,而InitKernel2如下所示:
/**
 * Writes one (a, b, a * b) triple per thread into ptr, drawing a and b from the
 * raw cuRAND state array.
 *
 * @param ptr     output array with one uint3 per launched thread (no bounds check is done)
 * @param states  generator states, forwarded unchanged to GetRandomNumber
 */
__global__ void InitKernel2(uint3 * ptr, curandState * states)
{
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row    = blockIdx.y * blockDim.y + threadIdx.y;
    const int slot   = row * blockDim.x * gridDim.x + column;

    // Draw in the same order as the stores: first factor, then second factor.
    const unsigned int first  = GetRandomNumber(states);
    const unsigned int second = GetRandomNumber(states);

    uint3 triple;
    triple.x = first;
    triple.y = second;
    triple.z = first * second;   // unsigned multiply, wraps modulo 2^32
    ptr[slot] = triple;
}
和GetRandomNumber是:
/**
 * Draws one random unsigned int for the calling thread from a raw state array.
 *
 * @param states  generator states, indexed by the thread's flat 2D position
 * @return a uniform float sample scaled by UINT_MAX and converted to unsigned int
 */
__device__ unsigned int GetRandomNumber(curandState * states)
{
    const int gx = blockIdx.x * blockDim.x + threadIdx.x;
    const int gy = blockIdx.y * blockDim.y + threadIdx.y;
    const int flat = gy * blockDim.x * gridDim.x + gx;

    // Each thread advances only its own generator state.
    curandState * mine = states + flat;
    const float u = curand_uniform(mine);

    // Multiply in double; the result is converted to unsigned int on return.
    const double scaled = static_cast<double>(UINT_MAX) * u;
    return scaled;
}
一切都運行良好。 有人知道我哪里做錯了嗎? 我已經為此努力了幾個小時。 我認為這可能與內存管理或指針傳遞有關,但我不知道具體是什麼問題。
請幫忙 :)!
這是非法的:
// `new` runs on the host, so `provider` holds a host-memory address ...
CuRandCuRandomNumberProvider * provider = new CuRandCuRandomNumberProvider(numBlocks, threadsPerBlock);
// ... and that host address is what the kernel receives as its second argument.
InitKernel<<<numBlocks, threadsPerBlock>>>(pqnD, provider);
provider
是在主機上分配的變量。 將該指針傳遞給設備並在設備代碼中解引用:
ptr[offset].x = provider->GetRandomNumber();
(最終導致:)
register float r = curand_uniform(&this->states[offset]);
是非法的。
由於您似乎想在主機上設置對象(類 CuRandCuRandomNumberProvider)並將其傳遞給設備,因此一種可能的解決方法是按值而不是按指針傳遞對象。 這需要在 main 中做一些更改:
// Plain object instead of a pointer obtained from `new`, so it can be handed to the kernel by value.
CuRandCuRandomNumberProvider provider(numBlocks, threadsPerBlock);
在InitKernel中:
/**
 * Writes one (a, b, a * b) triple per thread into ptr, drawing a and b from `provider`.
 *
 * @param ptr       output array with one uint3 per launched thread (no bounds check is done)
 * @param provider  random-number source, received as a copy in the kernel arguments
 */
__global__ void InitKernel(uint3 * ptr, CuRandCuRandomNumberProvider provider) // change: by value, not by pointer
{
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row    = blockIdx.y * blockDim.y + threadIdx.y;
    const int index  = row * blockDim.x * gridDim.x + column;

    uint3 & cell = ptr[index];
    cell.x = provider.GetRandomNumber(); // change: '.' instead of '->'
    cell.y = provider.GetRandomNumber(); // change: '.' instead of '->'
    cell.z = cell.x * cell.y;
}
在CuRandCuRandomNumberProvider :: GetRandomNumber()中:
/**
 * Draws one random unsigned int for the calling thread.
 *
 * The thread's flat 2D index selects its entry in the `states` member; a uniform
 * float is drawn from that state and scaled by UINT_MAX.
 *
 * @return the scaled sample, converted to unsigned int
 */
__device__ unsigned int CuRandCuRandomNumberProvider::GetRandomNumber()
{
    const int gx = blockIdx.x * blockDim.x + threadIdx.x;
    const int gy = blockIdx.y * blockDim.y + threadIdx.y;
    const int flat = gy * blockDim.x * gridDim.x + gx;

    const float u = curand_uniform(states + flat); // change: member accessed without an explicit this->

    // Multiply in double; the result is converted to unsigned int on return.
    return static_cast<double>(UINT_MAX) * u;
}
(並且我也刪除了析構函數原型,因為它正在妨礙。)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.