[英]How to use Cuda Memory 3D using cudaMalloc3D
I am learning 3D array manipulation in cuda. 我正在学习cuda中的3D数组操纵。 I implemented the following code, however I cannot get the expected result.
我实现了以下代码,但是无法获得预期的结果。 That is I am taking the array and changing the elements from 0 to 1. I tried to find the mistake but I cannot locate it.
那就是我正在使用数组并将元素从0更改为1。我试图找到错误,但是找不到它。 Can someone point where my mistake is in the code.
有人可以指出我的错误在代码中的位置吗?
int iDivUp(int a, int b) {
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
__global__
void kernel(cudaPitchedPtr d_pitched_ptr, int COLS, int ROWS, int D) {
int t_idx = threadIdx.x + blockIdx.x * blockDim.x;
int t_idy = threadIdx.y + blockIdx.y * blockDim.y;
char* d_ptr = static_cast<char*>(d_pitched_ptr.ptr);
size_t pitch = d_pitched_ptr.pitch;
float* element = (float*)(d_ptr + t_idy * pitch) + t_idx;
element[0] = 1;
element[1] = 1;
element[2] = 1;
}
void iFilter() {
const int ROWS = 100
const int COLS = 120
const int DEPTH = 3;
int pixels[COLS][ROWS][DEPTH];
for (int j = 0; j < ROWS; j++) {
for (int i = 0; i < COLS; i++) {
for (int k = 0; k < DEPTH; k++) {
pixels[i][j][k] = 0;
}
}
}
cudaExtent extent = make_cudaExtent(COLS * sizeof(int), ROWS, DEPTH);
cudaPitchedPtr d_pitched_ptr;
cudaMalloc3D(&d_pitched_ptr, extent);
cudaMemcpy3DParms d_parms = {0};
d_parms.srcPtr.ptr = pixels;
d_parms.srcPtr.pitch = COLS * sizeof(int);
d_parms.srcPtr.xsize = COLS;
d_parms.srcPtr.ysize = ROWS;
d_parms.dstPtr.ptr = d_pitched_ptr.ptr;
d_parms.dstPtr.pitch = d_pitched_ptr.pitch;
d_parms.dstPtr.xsize = COLS;
d_parms.dstPtr.ysize = ROWS;
d_parms.extent.width = COLS * sizeof(int);
d_parms.extent.height = ROWS;
d_parms.extent.depth = DEPTH;
d_parms.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&d_parms);
dim3 block_size(blocksize, blocksize);
dim3 grid_size(iDivUp(COLS, blocksize), iDivUp(ROWS, blocksize));
kernel<<<grid_size, block_size>>>(
d_pitched_ptr, COLS, ROWS, DEPTH);
int download_pixels[COLS][ROWS][DEPTH];
d_parms.srcPtr.ptr = d_pitched_ptr.ptr;
d_parms.srcPtr.pitch = d_pitched_ptr.pitch;
d_parms.dstPtr.ptr = download_pixels;
d_parms.dstPtr.pitch = COLS * sizeof(int);
d_parms.kind = cudaMemcpyDeviceToHost;
cudaMemcpy3D(&d_parms);
for (int j = 0; j < ROWS; j++) {
for (int i = 0; i < COLS; i++) {
for (int k = 0; k < DEPTH; k++) {
image.at<cv::Vec3b>(j, i)[k] = download_pixels[i][j][k];
std::cout << download_pixels[i][j][k] << " ";
}
std::cout << "\n";
}
}
}
OUTPUT: I get all 0 instead of 1 输出:我得到的全部是0而不是1
The only error I could find in your code is that you are passing an int
array to the kernel, but inside the kernel you are manipulating it as if it were a float
array: 我在您的代码中发现的唯一错误是,您正在将一个
int
数组传递给内核,但是在内核内部您正在对其进行操作,就好像它是一个float
组一样:
float* element = (float*)(d_ptr + t_idy * pitch) + t_idx;
When I fix that issue, the test code I created from yours runs without error, producing all 1 values at the final test: 解决此问题后,我从您创建的测试代码将运行无错误,并在最终测试中生成所有1个值:
$ cat t1114.cu
#include <iostream>
const int blocksize = 16;
int iDivUp(int a, int b) {
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
__global__
void kernel(cudaPitchedPtr d_pitched_ptr, int COLS, int ROWS, int D) {
int t_idx = threadIdx.x + blockIdx.x * blockDim.x;
int t_idy = threadIdx.y + blockIdx.y * blockDim.y;
char* d_ptr = static_cast<char*>(d_pitched_ptr.ptr);
size_t pitch = d_pitched_ptr.pitch;
int * element = (int *)(d_ptr + t_idy * pitch) + t_idx;
element[0] = 1;
element[1] = 1;
element[2] = 1;
}
int main() {
const int ROWS = 100;
const int COLS = 120;
const int DEPTH = 3;
int pixels[COLS][ROWS][DEPTH];
for (int j = 0; j < ROWS; j++) {
for (int i = 0; i < COLS; i++) {
for (int k = 0; k < DEPTH; k++) {
pixels[i][j][k] = 0;
}
}
}
cudaExtent extent = make_cudaExtent(COLS * sizeof(int), ROWS, DEPTH);
cudaPitchedPtr d_pitched_ptr;
cudaMalloc3D(&d_pitched_ptr, extent);
cudaMemcpy3DParms d_parms = {0};
d_parms.srcPtr.ptr = pixels;
d_parms.srcPtr.pitch = COLS * sizeof(int);
d_parms.srcPtr.xsize = COLS;
d_parms.srcPtr.ysize = ROWS;
d_parms.dstPtr.ptr = d_pitched_ptr.ptr;
d_parms.dstPtr.pitch = d_pitched_ptr.pitch;
d_parms.dstPtr.xsize = COLS;
d_parms.dstPtr.ysize = ROWS;
d_parms.extent.width = COLS * sizeof(int);
d_parms.extent.height = ROWS;
d_parms.extent.depth = DEPTH;
d_parms.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&d_parms);
dim3 block_size(blocksize, blocksize);
dim3 grid_size(iDivUp(COLS, blocksize), iDivUp(ROWS, blocksize));
kernel<<<grid_size, block_size>>>(
d_pitched_ptr, COLS, ROWS, DEPTH);
int download_pixels[COLS][ROWS][DEPTH];
d_parms.srcPtr.ptr = d_pitched_ptr.ptr;
d_parms.srcPtr.pitch = d_pitched_ptr.pitch;
d_parms.dstPtr.ptr = download_pixels;
d_parms.dstPtr.pitch = COLS * sizeof(int);
d_parms.kind = cudaMemcpyDeviceToHost;
cudaMemcpy3D(&d_parms);
for (int j = 0; j < ROWS; j++) {
for (int i = 0; i < COLS; i++) {
for (int k = 0; k < DEPTH; k++) {
if (download_pixels[i][j][k] != 1) std::cout << i << "," << j << "," << k << ": " << download_pixels[i][j][k] << " error! " << std::endl; return 1;
}
}
}
return 0;
}
$ nvcc -o t1114 t1114.cu
$ cuda-memcheck ./t1114
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.