I am struggling with a memory management problem. I keep receiving an "unspecified launch failure" error while copying results back to the host.
My code is quite simple: each thread generates two uints and multiplies them. I have a class for providing random numbers:
/**
 * Supplies pseudo-random unsigned integers to device threads.
 *
 * The constructors are host functions; GetRandomNumber() and IsPrime() are
 * __device__ members. The two-argument constructor fills `states` with the
 * result of cudaMalloc, so the member holds a device address even when the
 * object itself lives in host memory.
 *
 * NOTE(review): the seeded constructor and the destructor are only declared
 * here; their definitions are not visible in this snippet.
 */
class CuRandCuRandomNumberProvider   // no base class: the stray ':' after the name did not compile
{
public:
    /** Allocates per-thread generator state for the given launch configuration. */
    CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock);
    /** Same as above, but takes an explicit seed. */
    CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock, unsigned int seed);
    /** Returns the next random number for the calling device thread. */
    __device__ unsigned int GetRandomNumber();
    ~CuRandCuRandomNumberProvider();
protected:
    // Generator states, indexed by the flattened 2D global thread index.
    curandState * states;
    __device__ bool IsPrime(unsigned int number);
};
/**
 * Allocates one curandState per thread of the given 2D launch configuration
 * and initialises the states on the device with setup_kernel, seeded from the
 * current wall-clock time.
 *
 * A constructor cannot return an error code, so failures are reported on
 * std::cerr. If the allocation fails, `states` is left as NULL.
 *
 * @param numBlocks        grid dimensions (x and y are used)
 * @param threadsPerBlock  block dimensions (x and y are used)
 */
CuRandCuRandomNumberProvider::CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock)
{
    // Widen before multiplying so a large launch cannot overflow 32-bit arithmetic.
    const size_t numberOfThreads = (size_t)threadsPerBlock.x * threadsPerBlock.y
                                 * numBlocks.x * numBlocks.y;
    std::cout << numberOfThreads << std::endl;

    this->states = NULL;
    cudaError_t status = cudaMalloc ( &this->states, numberOfThreads * sizeof( curandState ) );
    if ( status != cudaSuccess )
    {
        std::cerr << "CuRandCuRandomNumberProvider: cudaMalloc failed: "
                  << cudaGetErrorString( status ) << std::endl;
        this->states = NULL;
        return;
    }

    setup_kernel <<< numBlocks, threadsPerBlock >>> ( this->states, time(NULL) );

    // A launch returns nothing: configuration errors come from cudaGetLastError(),
    // execution errors only from a synchronising call. Check both here so a
    // failure during setup is not blamed on a later, unrelated call.
    status = cudaGetLastError();
    if ( status == cudaSuccess )
    {
        status = cudaDeviceSynchronize();
    }
    if ( status != cudaSuccess )
    {
        std::cerr << "CuRandCuRandomNumberProvider: setup_kernel failed: "
                  << cudaGetErrorString( status ) << std::endl;
    }
}
/**
 * Draws one uniform float from the calling thread's generator state and
 * scales it by UINT_MAX to produce an unsigned integer.
 */
__device__ unsigned int CuRandCuRandomNumberProvider::GetRandomNumber()
{
    // Flatten the 2D global thread coordinates into a single index into `states`.
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int offset = column + row * rowWidth;
    register float r = curand_uniform(&this->states[offset]);
    // The double product is converted to unsigned int by the return.
    return ((double)UINT_MAX) * r;
}
setup_kernel is stored in a header file and looks like this:
/**
 * Initialises one curandState per thread of a 2D launch.
 *
 * Every thread uses the same seed and passes its flattened global index as
 * the second argument of curand_init, with a third argument of 0.
 *
 * @param state  array with one entry per launched thread
 * @param seed   seed shared by all threads
 */
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int slot = column + row * rowWidth;

    curand_init ( seed, slot, 0, state + slot );
}
My main kernel is very simple and looks like this:
// Fills ptr[offset] for each thread of a 2D launch: x and y receive two random
// numbers from the provider and z receives their (unsigned, wrapping) product.
//
// NOTE(review): `provider` is dereferenced in device code below, so it must
// point to memory the device can access. A pointer obtained from a host-side
// `new` does not qualify.
__global__ void InitKernel(uint3 * ptr, CuRandCuRandomNumberProvider * provider)
{
// Flatten the 2D global thread coordinates into one index into ptr.
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
ptr[offset].x = provider->GetRandomNumber();
ptr[offset].y = provider->GetRandomNumber();
ptr[offset].z = ptr[offset].x * ptr[offset].y;
}
The code in main, where the last cudaMemcpy fails, is:
// Host driver: allocate a host and a device result buffer, build the provider,
// run InitKernel and copy the results back.
uint3 * pqnD;
uint3 * pqnH = (uint3*)malloc(sizeof(uint3) * numberOfThreads );
memset(pqnH,0,sizeof(uint3) * numberOfThreads );
HANDLE_ERROR( cudaMalloc( (void**)&pqnD, sizeof(uint3) * numberOfThreads ));
// The provider object itself is created with host-side `new`, so `provider`
// is a host address.
CuRandCuRandomNumberProvider * provider = new CuRandCuRandomNumberProvider(numBlocks, threadsPerBlock);
// The launch is not followed by any error check; a fault inside the kernel is
// only reported by the next synchronising runtime call.
InitKernel<<<numBlocks, threadsPerBlock>>>(pqnD, provider);
// This blocking copy is the first runtime call after the launch, so an
// earlier kernel fault is reported here.
HANDLE_ERROR( cudaMemcpy( pqnH, pqnD, sizeof(uint3) * numberOfThreads, cudaMemcpyDeviceToHost ) ); // this line causes error
HANDLE_ERROR( cudaFree( pqnD ) );
If I do everything explicitly, like this:
// Host driver, explicit variant: the curandState array is allocated and
// initialised here and the raw device pointer is handed straight to the kernel.
uint3 * pqnD;
uint3 * pqnH = (uint3*)malloc(sizeof(uint3) * numberOfThreads );
memset(pqnH,0,sizeof(uint3) * numberOfThreads );
HANDLE_ERROR( cudaMalloc( (void**)&pqnD, sizeof(uint3) * numberOfThreads ));

curandState * states;
// Checked like every other runtime call in this snippet.
HANDLE_ERROR( cudaMalloc ( &states, numberOfThreads*sizeof( curandState ) ) );
setup_kernel <<< numBlocks, threadsPerBlock >>> ( states, time(NULL) );
// A launch returns nothing; a bad launch configuration only shows up here.
HANDLE_ERROR( cudaGetLastError() );

// NOTE(review): this object is not passed to InitKernel2, and the class as
// declared shows no constructor taking a curandState* - confirm it is needed.
CuRandCuRandomNumberProvider * provider = new CuRandCuRandomNumberProvider(numBlocks, threadsPerBlock, states);

InitKernel2<<<numBlocks, threadsPerBlock>>>(pqnD, states);
HANDLE_ERROR( cudaGetLastError() );
// Blocking copy: also reports any fault raised while the kernels were running.
HANDLE_ERROR( cudaMemcpy( pqnH, pqnD, sizeof(uint3) * numberOfThreads, cudaMemcpyDeviceToHost ) );
HANDLE_ERROR( cudaFree( pqnD ) );
Where setup_kernel is exactly the same and InitKernel2 looks like:
/**
 * Fills one uint3 per thread of a 2D launch: x and y receive two random
 * numbers drawn from `states`, z receives their unsigned (wrapping) product.
 *
 * @param ptr     output array, one element per launched thread
 * @param states  generator states, one per launched thread
 */
__global__ void InitKernel2(uint3 * ptr, curandState * states)
{
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int slot = column + row * rowWidth;

    uint3 * const cell = ptr + slot;
    cell->x = GetRandomNumber(states);
    cell->y = GetRandomNumber(states);
    cell->z = cell->x * cell->y;
}
and GetRandomNumber is:
/**
 * Draws one uniform float from the calling thread's entry in `states` and
 * scales it by UINT_MAX to produce an unsigned integer.
 *
 * @param states  generator states, indexed by the flattened 2D global thread index
 */
__device__ unsigned int GetRandomNumber(curandState * states)
{
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int slot = column + row * rowWidth;

    curandState * const mine = states + slot;
    const float sample = curand_uniform(mine);
    // The double product is converted to unsigned int by the return.
    return static_cast<double>(UINT_MAX) * sample;
}
Everything works like a charm. Does anyone have a clue what I am doing wrong? I've been struggling with this for hours. I think it might be something to do with memory management or pointer passing, but I don't know what it could be.
Please help :)!
This is illegal:
// `new` runs on the host, so `provider` holds a host address.
CuRandCuRandomNumberProvider * provider = new CuRandCuRandomNumberProvider(numBlocks, threadsPerBlock);
// That host address is handed to the kernel as-is.
InitKernel<<<numBlocks, threadsPerBlock>>>(pqnD, provider);
provider points to an object that you are allocating on the host. Passing that pointer to the device and dereferencing it in device code:
ptr[offset].x = provider->GetRandomNumber(); // dereferences `provider` inside the kernel
(ultimately leading to:)
register float r = curand_uniform(&this->states[offset]); // reads the `states` member through `this`
is illegal.
Since you seem to want to set up the object (of class CuRandCuRandomNumberProvider) on the host and pass it to the device, one possible fix would be to pass the object by value, rather than by pointer. This would necessitate a few changes, in main:
// Plain object instead of a `new`-ed pointer, so it can be passed to the kernel by value.
CuRandCuRandomNumberProvider provider(numBlocks, threadsPerBlock);
in InitKernel:
/**
 * Fills one uint3 per thread of a 2D launch: x and y receive two random
 * numbers from the provider, z receives their unsigned (wrapping) product.
 *
 * The provider is received by value, so the kernel works on its own copy of
 * the object rather than following a pointer supplied by the caller.
 */
__global__ void InitKernel(uint3 * ptr, CuRandCuRandomNumberProvider provider) // change
{
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int slot = column + row * rowWidth;

    uint3 * const cell = ptr + slot;
    cell->x = provider.GetRandomNumber(); // change
    cell->y = provider.GetRandomNumber(); // change
    cell->z = cell->x * cell->y;
}
in CuRandCuRandomNumberProvider::GetRandomNumber():
/**
 * Draws one uniform float from the calling thread's generator state and
 * scales it by UINT_MAX to produce an unsigned integer.
 */
__device__ unsigned int CuRandCuRandomNumberProvider::GetRandomNumber()
{
    // Flatten the 2D global thread coordinates into a single index into `states`.
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int offset = column + row * rowWidth;
    register float r = curand_uniform(&(states[offset])); // change
    // The double product is converted to unsigned int by the return.
    return ((double)UINT_MAX) * r;
}
(and I deleted the destructor prototype too, as it was getting in the way.)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.