I am learning 3D array manipulation in cuda. I implemented the following code, however I cannot get the expected result. That is I am taking the array and changing the elements from 0 to 1. I tried to find the mistake but I cannot locate it. Can someone point where my mistake is in the code.
int iDivUp(int a, int b) {
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
__global__
void kernel(cudaPitchedPtr d_pitched_ptr, int COLS, int ROWS, int D) {
int t_idx = threadIdx.x + blockIdx.x * blockDim.x;
int t_idy = threadIdx.y + blockIdx.y * blockDim.y;
char* d_ptr = static_cast<char*>(d_pitched_ptr.ptr);
size_t pitch = d_pitched_ptr.pitch;
float* element = (float*)(d_ptr + t_idy * pitch) + t_idx;
element[0] = 1;
element[1] = 1;
element[2] = 1;
}
void iFilter() {
const int ROWS = 100
const int COLS = 120
const int DEPTH = 3;
int pixels[COLS][ROWS][DEPTH];
for (int j = 0; j < ROWS; j++) {
for (int i = 0; i < COLS; i++) {
for (int k = 0; k < DEPTH; k++) {
pixels[i][j][k] = 0;
}
}
}
cudaExtent extent = make_cudaExtent(COLS * sizeof(int), ROWS, DEPTH);
cudaPitchedPtr d_pitched_ptr;
cudaMalloc3D(&d_pitched_ptr, extent);
cudaMemcpy3DParms d_parms = {0};
d_parms.srcPtr.ptr = pixels;
d_parms.srcPtr.pitch = COLS * sizeof(int);
d_parms.srcPtr.xsize = COLS;
d_parms.srcPtr.ysize = ROWS;
d_parms.dstPtr.ptr = d_pitched_ptr.ptr;
d_parms.dstPtr.pitch = d_pitched_ptr.pitch;
d_parms.dstPtr.xsize = COLS;
d_parms.dstPtr.ysize = ROWS;
d_parms.extent.width = COLS * sizeof(int);
d_parms.extent.height = ROWS;
d_parms.extent.depth = DEPTH;
d_parms.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&d_parms);
dim3 block_size(blocksize, blocksize);
dim3 grid_size(iDivUp(COLS, blocksize), iDivUp(ROWS, blocksize));
kernel<<<grid_size, block_size>>>(
d_pitched_ptr, COLS, ROWS, DEPTH);
int download_pixels[COLS][ROWS][DEPTH];
d_parms.srcPtr.ptr = d_pitched_ptr.ptr;
d_parms.srcPtr.pitch = d_pitched_ptr.pitch;
d_parms.dstPtr.ptr = download_pixels;
d_parms.dstPtr.pitch = COLS * sizeof(int);
d_parms.kind = cudaMemcpyDeviceToHost;
cudaMemcpy3D(&d_parms);
for (int j = 0; j < ROWS; j++) {
for (int i = 0; i < COLS; i++) {
for (int k = 0; k < DEPTH; k++) {
image.at<cv::Vec3b>(j, i)[k] = download_pixels[i][j][k];
std::cout << download_pixels[i][j][k] << " ";
}
std::cout << "\n";
}
}
}
OUTPUT: I get all 0 instead of 1
The only error I could find in your code is that you are passing an int
array to the kernel, but inside the kernel you are manipulating it as if it were a float
array:
float* element = (float*)(d_ptr + t_idy * pitch) + t_idx;
When I fix that issue, the test code I created from yours runs without error, producing all 1 values at the final test:
$ cat t1114.cu
#include <iostream>
const int blocksize = 16;
int iDivUp(int a, int b) {
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
__global__
void kernel(cudaPitchedPtr d_pitched_ptr, int COLS, int ROWS, int D) {
int t_idx = threadIdx.x + blockIdx.x * blockDim.x;
int t_idy = threadIdx.y + blockIdx.y * blockDim.y;
char* d_ptr = static_cast<char*>(d_pitched_ptr.ptr);
size_t pitch = d_pitched_ptr.pitch;
int * element = (int *)(d_ptr + t_idy * pitch) + t_idx;
element[0] = 1;
element[1] = 1;
element[2] = 1;
}
int main() {
const int ROWS = 100;
const int COLS = 120;
const int DEPTH = 3;
int pixels[COLS][ROWS][DEPTH];
for (int j = 0; j < ROWS; j++) {
for (int i = 0; i < COLS; i++) {
for (int k = 0; k < DEPTH; k++) {
pixels[i][j][k] = 0;
}
}
}
cudaExtent extent = make_cudaExtent(COLS * sizeof(int), ROWS, DEPTH);
cudaPitchedPtr d_pitched_ptr;
cudaMalloc3D(&d_pitched_ptr, extent);
cudaMemcpy3DParms d_parms = {0};
d_parms.srcPtr.ptr = pixels;
d_parms.srcPtr.pitch = COLS * sizeof(int);
d_parms.srcPtr.xsize = COLS;
d_parms.srcPtr.ysize = ROWS;
d_parms.dstPtr.ptr = d_pitched_ptr.ptr;
d_parms.dstPtr.pitch = d_pitched_ptr.pitch;
d_parms.dstPtr.xsize = COLS;
d_parms.dstPtr.ysize = ROWS;
d_parms.extent.width = COLS * sizeof(int);
d_parms.extent.height = ROWS;
d_parms.extent.depth = DEPTH;
d_parms.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&d_parms);
dim3 block_size(blocksize, blocksize);
dim3 grid_size(iDivUp(COLS, blocksize), iDivUp(ROWS, blocksize));
kernel<<<grid_size, block_size>>>(
d_pitched_ptr, COLS, ROWS, DEPTH);
int download_pixels[COLS][ROWS][DEPTH];
d_parms.srcPtr.ptr = d_pitched_ptr.ptr;
d_parms.srcPtr.pitch = d_pitched_ptr.pitch;
d_parms.dstPtr.ptr = download_pixels;
d_parms.dstPtr.pitch = COLS * sizeof(int);
d_parms.kind = cudaMemcpyDeviceToHost;
cudaMemcpy3D(&d_parms);
for (int j = 0; j < ROWS; j++) {
for (int i = 0; i < COLS; i++) {
for (int k = 0; k < DEPTH; k++) {
if (download_pixels[i][j][k] != 1) std::cout << i << "," << j << "," << k << ": " << download_pixels[i][j][k] << " error! " << std::endl; return 1;
}
}
}
return 0;
}
$ nvcc -o t1114 t1114.cu
$ cuda-memcheck ./t1114
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.