[英]Cuda - 3D block & grid dimension confusion - Another one
在下面的簡單示例中,我將使用cudaMalloc3D
在設備上分配內存,並將3D數據的每個體素加1,只要我使用對稱3D卷, cudaMalloc3D
可以正常工作。
主機代碼如下所示:
int main(void)
{
typedef float PixelType;
// Set up test data
dim3 image_dimensions = dim3(32, 32, 32);
size_t num_elements = image_dimensions.x * image_dimensions.y * image_dimensions.z;
PixelType *image_data = new float[num_elements];
for(int i = 0; i < num_elements; ++i)
{
image_data[i] = float(i);
}
// Allocate 3D memory on the device
cudaExtent volumeSizeBytes = make_cudaExtent(sizeof(PixelType) * image_dimensions.x, image_dimensions.y, image_dimensions.z);
cudaPitchedPtr devicePitchedPointer;
cudaMalloc3D(&devicePitchedPointer, volumeSizeBytes);
cudaMemset3D(devicePitchedPointer, 1.0f, volumeSizeBytes);
// Copy image data from the host to the device
cudaMemcpy3DParms copy_params_host_to_device = {0};
copy_params_host_to_device.srcPtr = make_cudaPitchedPtr((void *)image_data, sizeof(PixelType) * image_dimensions.x, image_dimensions.y, image_dimensions.z);
copy_params_host_to_device.dstPtr = devicePitchedPointer;
copy_params_host_to_device.extent = volumeSizeBytes;
copy_params_host_to_device.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(©_params_host_to_device);
// Kernel Launch Configuration
dim3 threads_per_block = dim3(8, 8, 8);
dim3 blocks_per_grid = dim3((image_dimensions.x + threads_per_block.x - 1) / threads_per_block.x, (image_dimensions.y + threads_per_block.y - 1) / threads_per_block.y, (image_dimensions.z + threads_per_block.z - 1) / threads_per_block.z);
extract_patches_from_image_data<<<blocks_per_grid, threads_per_block>>>(devicePitchedPointer, image_dimensions);
cudaDeviceSynchronize();
// Copy image data back from the device to the host
cudaMemcpy3DParms copy_params_device_to_host = {0};
copy_params_device_to_host.srcPtr = devicePitchedPointer;
copy_params_device_to_host.dstPtr = make_cudaPitchedPtr((void *)image_data, sizeof(PixelType) * image_dimensions.x, image_dimensions.y, image_dimensions.z);
copy_params_device_to_host.extent = volumeSizeBytes;
copy_params_device_to_host.kind = cudaMemcpyDeviceToHost;
cudaMemcpy3D(©_params_device_to_host);
// Check image data
for(int i = 0; i < num_elements; ++i)
{
std::cout << "Element: " << i << " - " << image_data[i] << std::endl;
}
// Free Memory
cudaFree(devicePitchedPointer.ptr);
delete [] image_data;
}
用於遞增所有值的相應內核:
__global__ void extract_patches_from_image_data(cudaPitchedPtr devicePitchedPointer, dim3 image_dimensions)
{
// Index Calculation
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int z = threadIdx.z + blockDim.z * blockIdx.z;
// Get attributes from device pitched pointer
char *devicePointer = (char *)devicePitchedPointer.ptr;
size_t pitch = devicePitchedPointer.pitch;
size_t slicePitch = pitch * image_dimensions.y;
// Loop over image data
if(z < image_dimensions.z)
{
char *current_slice_index = devicePointer + z * slicePitch;
if(y < image_dimensions.y)
{
// Get data array containing all elements from the current row
PixelType *current_row = (PixelType *)(current_slice_index + y * pitch);
if(x < image_dimensions.x)
{
current_row[x] = current_row[x] + 1.0f;
// Get values of all all neighbors
}
}
}
}
只要我保持image_dimensions
對稱,例如( image_dimensions
),一切就可以正常工作。 當我嘗試使用(32,32,33)時,它可以正常工作直到體素33759,以下值保持不變。 現在我的問題是我應該如何修改我的代碼以使用非對稱數據。
float
傳遞給cudaMemset3D
。 如果您打算將每個浮動量設置為此值,那將不起作用。 cudaMemset3D的工作方式類似於主機memset
函數。 它采用unsigned char
值並設置unsigned char
數量。 您不能使用此方法正確地將float
值初始化為1.0f。 但這也不是問題的症結所在。 make_cudaPitchedPtr
函數。 請查閱文檔 。 最后兩個參數應分別為x
和y
尺寸,而不是y
和z
。 您的代碼中有兩個實例。 通過修改make_cudaPitchedPtr
的兩種用法,我能夠使您的代碼正確運行
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.