[英]'an illegal memory access' when trying to write to a 2D array allocated using cudaMalloc3D
[英]How to use Cuda Memory 3D using cudaMalloc3D
我正在學習 CUDA 中的 3D 數組操作。我實現了以下代碼,但無法得到預期的結果:我先把數組的所有元素初始化為 0,然後想在內核中把它們改為 1。我嘗試過查找錯誤,但沒有找到。有人可以指出我的代碼錯在哪裡嗎?
// Returns a / b, plus one when the remainder a % b is non-zero
// (i.e. ceiling division for non-negative a and positive b).
int iDivUp(int a, int b) {
    const int quotient = a / b;
    if (a % b == 0) {
        return quotient;
    }
    return quotient + 1;
}
// Each thread derives an (x, y) position from its thread/block indices and
// stores 1 into three consecutive elements of row y of the pitched
// allocation, starting at column x.
//
// d_pitched_ptr: pitched device allocation; only .ptr and .pitch are read.
// COLS, ROWS, D: accepted but never read inside this kernel.
//
// NOTE(review): because COLS/ROWS are unused there is no bounds guard, so
// threads of a rounded-up grid also write — confirm the launch configuration.
__global__
void kernel(cudaPitchedPtr d_pitched_ptr, int COLS, int ROWS, int D) {
// Global x / y coordinates of this thread within the launch grid.
int t_idx = threadIdx.x + blockIdx.x * blockDim.x;
int t_idy = threadIdx.y + blockIdx.y * blockDim.y;
// Byte pointer so that row offsets can be applied in bytes.
char* d_ptr = static_cast<char*>(d_pitched_ptr.ptr);
// Row stride of the allocation, in bytes.
size_t pitch = d_pitched_ptr.pitch;
// Start of row t_idy (byte offset t_idy * pitch), then advanced by t_idx
// float-sized elements.
// NOTE(review): iFilter allocates and fills this buffer as int, but it is
// addressed as float here, so the stores below write the bit pattern of 1.0f
// rather than the integer 1 — confirm the intended element type.
float* element = (float*)(d_ptr + t_idy * pitch) + t_idx;
// These three stores hit columns t_idx, t_idx + 1 and t_idx + 2 of the same
// row. NOTE(review): presumably meant to address the three depth slices — verify.
element[0] = 1;
element[1] = 1;
element[2] = 1;
}
// Prints a diagnostic on stderr when a CUDA runtime call did not succeed.
// Returns true if `status` is cudaSuccess, false otherwise.
static bool iFilterCudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Zero-fills a host int volume, uploads it into a cudaMalloc3D allocation,
// launches `kernel` over a COLS x ROWS grid, downloads the result, copies it
// into `image` and prints every value.
//
// Relies on `blocksize`, `image`, `kernel` and `iDivUp` being defined
// elsewhere. On any CUDA failure a message is printed and the function
// returns early after releasing the device allocation.
void iFilter() {
    const int ROWS = 100;
    const int COLS = 120;
    const int DEPTH = 3;

    // Host source volume, zero-initialised.
    int pixels[COLS][ROWS][DEPTH];
    for (int j = 0; j < ROWS; j++) {
        for (int i = 0; i < COLS; i++) {
            for (int k = 0; k < DEPTH; k++) {
                pixels[i][j][k] = 0;
            }
        }
    }

    // Extent width is given in bytes, height and depth in elements.
    cudaExtent extent = make_cudaExtent(COLS * sizeof(int), ROWS, DEPTH);
    cudaPitchedPtr d_pitched_ptr;
    if (!iFilterCudaOk(cudaMalloc3D(&d_pitched_ptr, extent), "cudaMalloc3D")) {
        return;
    }

    // Host -> device copy. The host buffer is described as tightly packed
    // rows of COLS ints, ROWS rows per slice.
    cudaMemcpy3DParms d_parms = {0};
    d_parms.srcPtr.ptr = pixels;
    d_parms.srcPtr.pitch = COLS * sizeof(int);
    d_parms.srcPtr.xsize = COLS;
    d_parms.srcPtr.ysize = ROWS;
    d_parms.dstPtr.ptr = d_pitched_ptr.ptr;
    d_parms.dstPtr.pitch = d_pitched_ptr.pitch;
    d_parms.dstPtr.xsize = COLS;
    d_parms.dstPtr.ysize = ROWS;
    d_parms.extent.width = COLS * sizeof(int);
    d_parms.extent.height = ROWS;
    d_parms.extent.depth = DEPTH;
    d_parms.kind = cudaMemcpyHostToDevice;
    if (!iFilterCudaOk(cudaMemcpy3D(&d_parms), "cudaMemcpy3D (host to device)")) {
        cudaFree(d_pitched_ptr.ptr);
        return;
    }

    // One thread per (column, row); the grid is rounded up to whole blocks.
    dim3 block_size(blocksize, blocksize);
    dim3 grid_size(iDivUp(COLS, blocksize), iDivUp(ROWS, blocksize));
    kernel<<<grid_size, block_size>>>(
        d_pitched_ptr, COLS, ROWS, DEPTH);
    // A launch does not return a status; launch-configuration errors are
    // reported through cudaGetLastError().
    if (!iFilterCudaOk(cudaGetLastError(), "kernel launch")) {
        cudaFree(d_pitched_ptr.ptr);
        return;
    }

    // Device -> host copy, reusing the same extent with source and
    // destination swapped.
    int download_pixels[COLS][ROWS][DEPTH];
    d_parms.srcPtr.ptr = d_pitched_ptr.ptr;
    d_parms.srcPtr.pitch = d_pitched_ptr.pitch;
    d_parms.dstPtr.ptr = download_pixels;
    d_parms.dstPtr.pitch = COLS * sizeof(int);
    d_parms.kind = cudaMemcpyDeviceToHost;
    const bool downloaded =
        iFilterCudaOk(cudaMemcpy3D(&d_parms), "cudaMemcpy3D (device to host)");
    // The device buffer is no longer needed whether or not the copy succeeded.
    cudaFree(d_pitched_ptr.ptr);
    if (!downloaded) {
        return;
    }

    // Store the downloaded values into the image and echo them.
    for (int j = 0; j < ROWS; j++) {
        for (int i = 0; i < COLS; i++) {
            for (int k = 0; k < DEPTH; k++) {
                image.at<cv::Vec3b>(j, i)[k] = download_pixels[i][j][k];
                std::cout << download_pixels[i][j][k] << " ";
            }
            std::cout << "\n";
        }
    }
}
輸出:我得到的全部是0而不是1
我在您的代碼中發現的唯一錯誤是:您傳遞給內核的是一個 int 數組,但在內核內部卻把它當作 float 數組來操作:
float* element = (float*)(d_ptr + t_idy * pitch) + t_idx;
解決此問題後,我根據您的代碼創建的測試程序可以無錯誤地運行,並且在最終的檢查中所有值均為 1:
$ cat t1114.cu
#include <iostream>
// Threads per block along each of x and y; main() launches
// blocksize x blocksize thread blocks and sizes the grid with it.
const int blocksize = 16;
// Divides a by b and adds one to the quotient when there is a non-zero
// remainder (ceiling division for non-negative a and positive b).
int iDivUp(int a, int b) {
    int result = a / b;
    const bool has_remainder = (a % b) != 0;
    if (has_remainder) {
        ++result;
    }
    return result;
}
// Sets every element of a COLS x ROWS x D int volume to 1.
//
// Launch layout: 2D grid of 2D blocks, one thread per (column, row); the
// grid may be rounded up past COLS / ROWS, surplus threads exit immediately.
// Each thread walks the D depth slices of its (column, row) position.
//
// d_pitched_ptr: allocation obtained from cudaMalloc3D with an extent of
//                (COLS * sizeof(int)) bytes x ROWS x D.
// COLS, ROWS:    logical width and height of one slice, in elements.
// D:             number of slices.
__global__
void kernel(cudaPitchedPtr d_pitched_ptr, int COLS, int ROWS, int D) {
    const int t_idx = threadIdx.x + blockIdx.x * blockDim.x;
    const int t_idy = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is a ceil-division of the extent, so threads beyond the last
    // column/row must not write (they would land in padding or other rows).
    if (t_idx >= COLS || t_idy >= ROWS) {
        return;
    }

    char* d_ptr = static_cast<char*>(d_pitched_ptr.ptr);
    const size_t pitch = d_pitched_ptr.pitch;    // bytes between rows
    const size_t slice_pitch = pitch * ROWS;     // bytes between depth slices

    for (int z = 0; z < D; ++z) {
        // Offsets are applied in bytes first, then the row is indexed as int.
        int* row = reinterpret_cast<int*>(d_ptr + z * slice_pitch + t_idy * pitch);
        row[t_idx] = 1;
    }
}
// Prints a diagnostic on stderr when a CUDA runtime call did not succeed.
// Returns true if `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Test driver: uploads a zero-filled int volume, runs `kernel` on it,
// downloads the result and verifies that every element equals 1.
// Returns 0 when all elements are 1, and 1 on the first mismatch or on any
// CUDA runtime failure.
int main() {
    const int ROWS = 100;
    const int COLS = 120;
    const int DEPTH = 3;

    // Host source volume, zero-initialised. Note: it is indexed
    // [COLS][ROWS][DEPTH] here, while the copy parameters below describe the
    // same bytes as DEPTH slices of ROWS rows of COLS ints; this is harmless
    // only because every element holds the same value.
    int pixels[COLS][ROWS][DEPTH];
    for (int j = 0; j < ROWS; j++) {
        for (int i = 0; i < COLS; i++) {
            for (int k = 0; k < DEPTH; k++) {
                pixels[i][j][k] = 0;
            }
        }
    }

    // Extent width is given in bytes, height and depth in elements.
    cudaExtent extent = make_cudaExtent(COLS * sizeof(int), ROWS, DEPTH);
    cudaPitchedPtr d_pitched_ptr;
    if (!cudaOk(cudaMalloc3D(&d_pitched_ptr, extent), "cudaMalloc3D")) {
        return 1;
    }

    // Host -> device copy of the whole volume.
    cudaMemcpy3DParms d_parms = {0};
    d_parms.srcPtr.ptr = pixels;
    d_parms.srcPtr.pitch = COLS * sizeof(int);
    d_parms.srcPtr.xsize = COLS;
    d_parms.srcPtr.ysize = ROWS;
    d_parms.dstPtr.ptr = d_pitched_ptr.ptr;
    d_parms.dstPtr.pitch = d_pitched_ptr.pitch;
    d_parms.dstPtr.xsize = COLS;
    d_parms.dstPtr.ysize = ROWS;
    d_parms.extent.width = COLS * sizeof(int);
    d_parms.extent.height = ROWS;
    d_parms.extent.depth = DEPTH;
    d_parms.kind = cudaMemcpyHostToDevice;
    if (!cudaOk(cudaMemcpy3D(&d_parms), "cudaMemcpy3D (host to device)")) {
        cudaFree(d_pitched_ptr.ptr);
        return 1;
    }

    // One thread per (column, row); the grid is rounded up to whole blocks.
    dim3 block_size(blocksize, blocksize);
    dim3 grid_size(iDivUp(COLS, blocksize), iDivUp(ROWS, blocksize));
    kernel<<<grid_size, block_size>>>(
        d_pitched_ptr, COLS, ROWS, DEPTH);
    // A launch does not return a status; launch-configuration errors are
    // reported through cudaGetLastError(). In-kernel faults are reported by
    // the blocking copy that follows.
    if (!cudaOk(cudaGetLastError(), "kernel launch")) {
        cudaFree(d_pitched_ptr.ptr);
        return 1;
    }

    // Device -> host copy, reusing the same extent with source and
    // destination swapped.
    int download_pixels[COLS][ROWS][DEPTH];
    d_parms.srcPtr.ptr = d_pitched_ptr.ptr;
    d_parms.srcPtr.pitch = d_pitched_ptr.pitch;
    d_parms.dstPtr.ptr = download_pixels;
    d_parms.dstPtr.pitch = COLS * sizeof(int);
    d_parms.kind = cudaMemcpyDeviceToHost;
    const bool downloaded =
        cudaOk(cudaMemcpy3D(&d_parms), "cudaMemcpy3D (device to host)");
    // The device buffer is no longer needed whether or not the copy succeeded.
    cudaFree(d_pitched_ptr.ptr);
    if (!downloaded) {
        return 1;
    }

    // Verify every element. The braces matter: without them the `return 1`
    // is not governed by the `if` and runs on the very first iteration.
    for (int j = 0; j < ROWS; j++) {
        for (int i = 0; i < COLS; i++) {
            for (int k = 0; k < DEPTH; k++) {
                if (download_pixels[i][j][k] != 1) {
                    std::cout << i << "," << j << "," << k << ": " << download_pixels[i][j][k] << " error! " << std::endl;
                    return 1;
                }
            }
        }
    }
    return 0;
}
$ nvcc -o t1114 t1114.cu
$ cuda-memcheck ./t1114
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.