發送3D數組到CUDA內核

Question

我將給出的代碼作為答案，以了解如何使用嵌套的for循環添加兩個2d（間距）數組？ 並嘗試將其用於3D而不是2D，並且還稍微更改了其他部分，現在看起來如下所示：

 __global__ void doSmth(int*** a) {
  for(int i=0; i<2; i++)
   for(int j=0; j<2; j++)
    for(int k=0; k<2; k++) 
     a[i][j][k]=i+j+k;
 }

 int main() {
  int*** h_c = (int***) malloc(2*sizeof(int**));
  for(int i=0; i<2; i++) {
   h_c[i] = (int**) malloc(2*sizeof(int*));
   for(int j=0; j<2; j++)
    GPUerrchk(cudaMalloc((void**)&h_c[i][j],2*sizeof(int)));
  }
  int*** d_c;
  GPUerrchk(cudaMalloc((void****)&d_c,2*sizeof(int**)));
  GPUerrchk(cudaMemcpy(d_c,h_c,2*sizeof(int**),cudaMemcpyHostToDevice));
  doSmth<<<1,1>>>(d_c);
  GPUerrchk(cudaPeekAtLastError());

  int res[2][2][2];
  for(int i=0; i<2; i++)
   for(int j=0; j<2; j++)
    GPUerrchk(cudaMemcpy(&res[i][j][0],
    h_c[i][j],2*sizeof(int),cudaMemcpyDeviceToHost));  

  for(int i=0; i<2; i++)
   for(int j=0; j<2; j++)
    for(int k=0; k<2; k++) 
     printf("[%d][%d][%d]=%d\n",i,j,k,res[i][j][k]);     
 }

在上面的代碼中，我為h_c的每個維度使用2作為大小，在實際的實現中，對於“ int ***”或更大維度的子數組的每個部分，我將擁有非常大的數目，而對於每個大小，它們將具有不同的大小。 我在內核調用之后的部分出現問題，在該部分嘗試將結果復制回res數組。 您能幫我解決問題嗎？ 請您以我上面編寫的方式顯示解決方案。 謝謝！

Answer 1

首先，我認為當他發表對您提到的上一個問題的答復時，達隆密斯並不打算代表良好的編碼。 因此，弄清楚如何將其擴展到3D可能不是最好的時間。 例如，為什么我們要編寫僅使用一個線程的程序？ 盡管可能會合理使用這種內核，但這並不是其中之一。 您的內核可以並行執行一堆獨立的工作，但是您可以將其全部強制到一個線程中並進行序列化。 並行工作的定義是：

a[i][j][k]=i+j+k;

讓我們弄清楚如何在GPU上並行處理它。

我要介紹的另一篇介紹性文章是，由於我們正在處理事先已知的大小問題，因此讓我們使用C來解決它們，並從我們的語言中獲得最大的收益。 在某些情況下，可能需要使用嵌套循環來執行cudaMalloc，但我認為這不是其中之一。

這是並行完成工作的代碼：

#include <stdio.h>
#include <stdlib.h>
// set a 3D volume
// To compile it with nvcc execute: nvcc -O2 -o set3d set3d.cu
//define the data set size (cubic volume)
#define DATAXSIZE 100
#define DATAYSIZE 100
#define DATAZSIZE 20
//define the chunk sizes that each threadblock will work on
#define BLKXSIZE 32
#define BLKYSIZE 4
#define BLKZSIZE 4

// for cuda error checking
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            return 1; \
        } \
    } while (0)

// device function to set the 3D volume
__global__ void set(int a[][DATAYSIZE][DATAXSIZE])
{
    unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned idy = blockIdx.y*blockDim.y + threadIdx.y;
    unsigned idz = blockIdx.z*blockDim.z + threadIdx.z;
    if ((idx < (DATAXSIZE)) && (idy < (DATAYSIZE)) && (idz < (DATAZSIZE))){
      a[idz][idy][idx] = idz+idy+idx;
      }
}

int main(int argc, char *argv[])
{
    typedef int nRarray[DATAYSIZE][DATAXSIZE];
    const dim3 blockSize(BLKXSIZE, BLKYSIZE, BLKZSIZE);
    const dim3 gridSize(((DATAXSIZE+BLKXSIZE-1)/BLKXSIZE), ((DATAYSIZE+BLKYSIZE-1)/BLKYSIZE), ((DATAZSIZE+BLKZSIZE-1)/BLKZSIZE));
// overall data set sizes
    const int nx = DATAXSIZE;
    const int ny = DATAYSIZE;
    const int nz = DATAZSIZE;
// pointers for data set storage via malloc
    nRarray *c; // storage for result stored on host
    nRarray *d_c;  // storage for result computed on device
// allocate storage for data set
    if ((c = (nRarray *)malloc((nx*ny*nz)*sizeof(int))) == 0) {fprintf(stderr,"malloc1 Fail \n"); return 1;}
// allocate GPU device buffers
    cudaMalloc((void **) &d_c, (nx*ny*nz)*sizeof(int));
    cudaCheckErrors("Failed to allocate device buffer");
// compute result
    set<<<gridSize,blockSize>>>(d_c);
    cudaCheckErrors("Kernel launch failure");
// copy output data back to host

    cudaMemcpy(c, d_c, ((nx*ny*nz)*sizeof(int)), cudaMemcpyDeviceToHost);
    cudaCheckErrors("CUDA memcpy failure");
// and check for accuracy
    for (unsigned i=0; i<nz; i++)
      for (unsigned j=0; j<ny; j++)
        for (unsigned k=0; k<nx; k++)
          if (c[i][j][k] != (i+j+k)) {
            printf("Mismatch at x= %d, y= %d, z= %d  Host= %d, Device = %d\n", i, j, k, (i+j+k), c[i][j][k]);
            return 1;
            }
    printf("Results check!\n");
    free(c);
    cudaFree(d_c);
    cudaCheckErrors("cudaFree fail");
    return 0;
}

既然您在注釋中已經要求過它，那么這里是我可以對您的代碼進行更改以使其起作用的最少數量的更改。 讓我們還提醒您自己，您所引用的上一個問題中的一些標准論評論：

“出於代碼復雜性和性能的原因，您真的不想這樣做，因為與使用線性內存的替代方法相比，在CUDA代碼中使用指針數組既困難又慢。”

“與使用線性內存相比，這是一個糟糕的主意。”

我必須在紙上畫出圖表，以確保所有指針復制正確。

#include <cstdio>
inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
{
    if (code != 0) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
        if (Abort) exit(code);
    }
}

#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }



 __global__ void doSmth(int*** a) {
  for(int i=0; i<2; i++)
   for(int j=0; j<2; j++)
    for(int k=0; k<2; k++)
     a[i][j][k]=i+j+k;
 }
 int main() {
  int*** h_c = (int***) malloc(2*sizeof(int**));
  for(int i=0; i<2; i++) {
   h_c[i] = (int**) malloc(2*sizeof(int*));
   for(int j=0; j<2; j++)
    GPUerrchk(cudaMalloc((void**)&h_c[i][j],2*sizeof(int)));
  }
  int ***h_c1 = (int ***) malloc(2*sizeof(int **));
  for (int i=0; i<2; i++){
    GPUerrchk(cudaMalloc((void***)&(h_c1[i]), 2*sizeof(int*)));
    GPUerrchk(cudaMemcpy(h_c1[i], h_c[i], 2*sizeof(int*), cudaMemcpyHostToDevice));
    }
  int*** d_c;
  GPUerrchk(cudaMalloc((void****)&d_c,2*sizeof(int**)));
  GPUerrchk(cudaMemcpy(d_c,h_c1,2*sizeof(int**),cudaMemcpyHostToDevice));
  doSmth<<<1,1>>>(d_c);
  GPUerrchk(cudaPeekAtLastError());
  int res[2][2][2];
  for(int i=0; i<2; i++)
   for(int j=0; j<2; j++)
    GPUerrchk(cudaMemcpy(&res[i][j][0], h_c[i][j],2*sizeof(int),cudaMemcpyDeviceToHost));

  for(int i=0; i<2; i++)
   for(int j=0; j<2; j++)
    for(int k=0; k<2; k++)
     printf("[%d][%d][%d]=%d\n",i,j,k,res[i][j][k]);
 }

簡而言之，我們必須執行以下連續序列：

malloc（在主機上）分配一個多維數組的指針，比問題的大小小一個維度，最后一個維度是指向設備而不是主機上分配給cudaMalloc的區域的一組指針。
創建另一個多維數組指針，該數組與上一步中創建的類相同，但比上一步中創建的小一維。 該數組還必須在設備上具有cudaMalloc的最終排名。
將上一個第二步中的最后一組主機指針復制到上一步中設備上分配的cudaMalloc區域中。
重復步驟2-3，直到最終得到指向多維數組指針的單個（主機）指針，所有這些指針現在都駐留在設備上。

發送3D數組到CUDA內核

問題描述

1 個解決方案

解決方案1
9 已采納 2012-10-16 23:31:41

發送3D數組到CUDA內核

問題描述

1 個解決方案

解決方案1 9 已采納 2012-10-16 23:31:41

解決方案1
9 已采納 2012-10-16 23:31:41