简体   繁体   English

内核CUDA中的2D数组

[英]2D array in the Kernel CUDA

I wanted to use a 2D array on the GPU the same way we do on the CPU. 我想在GPU上使用2D数组,就像在CPU上一样。 Hence the code below. 因此下面的代码。 It executes without errors but returns some garbage values. 它执行没有错误但返回一些垃圾值。

Could anyone please suggest what might have gone wrong? 任何人都可以建议我可能出了什么问题......!

Thank you. 谢谢。

#include<stdio.h>

/*
 * Adds 2 to one element of a pointer-to-pointer 2D array and prints the
 * updated value from the device.
 *
 * da : table of row pointers. It is dereferenced twice on the device
 *      (da[row][col]), so both the table itself and every row it points
 *      to must be device-accessible memory.
 *
 * Indexing: threadIdx.x selects the row, threadIdx.y selects the column.
 * There is no bounds check, so the block dimensions must not exceed the
 * array dimensions. blockIdx is not used; every block touches the same
 * elements.
 */
__global__ void add2(int** da)
{
    const int row = threadIdx.x;
    const int col = threadIdx.y;

    /* Fetch the row pointer, then update the element in place. */
    int* const rowPtr = da[row];
    rowPtr[col] += 2;

    /* Device-side printf: debugging aid only. */
    printf("%d \n", rowPtr[col]);
}

/*
 * Host driver: uploads a 2x2 zero matrix, launches add2 so that every
 * element is incremented by 2, and prints the matrix before and after.
 *
 * Fixes relative to the original version:
 *  - cudaMemcpy2D was given &da (the host address of the pointer variable)
 *    as destination and had the source/destination pitches swapped.
 *  - The pitched buffer was handed to the kernel as an int**, so the kernel
 *    dereferenced payload bytes as pointers. The kernel really needs a
 *    device-resident table of row pointers; one is now built on top of the
 *    pitched allocation.
 *  - The launch <<<2,2>>> created two 1D blocks, so threadIdx.y was always 0
 *    and both blocks raced on column 0. A single 2x2 block is used instead.
 *  - Every CUDA call is checked and device memory is released on all paths.
 *
 * Returns 0 on success, 1 if any CUDA call fails.
 */
int main()
{
 const int kRows = 2;
 const int kCols = 2;

 int ha[kRows][kCols], hb[kRows][kCols];
 int *dData = NULL;        /* pitched device storage holding the elements */
 int **da = NULL;          /* device table of row pointers into dData     */
 int *hRowPtrs[kRows];     /* host staging copy of that table             */
 size_t pitch = 0;
 cudaError_t err = cudaSuccess;
 const char *stage = "";

 for(int i=0;i<kRows;i++)
    {
    for(int j=0;j<kCols;j++)
       ha[i][j]=0;
    }

 do
    {
     stage = "cudaMallocPitch";
     err = cudaMallocPitch((void**)&dData, &pitch, kCols*sizeof(int), kRows);
     if (err != cudaSuccess) break;

     /* Destination uses the device pitch; the host array is tightly packed. */
     stage = "cudaMemcpy2D (host to device)";
     err = cudaMemcpy2D(dData, pitch, ha, kCols*sizeof(int),
                        kCols*sizeof(int), kRows, cudaMemcpyHostToDevice);
     if (err != cudaSuccess) break;

     /* Row i starts i*pitch BYTES after the base address. */
     for(int i=0;i<kRows;i++)
        hRowPtrs[i] = (int*)((char*)dData + i*pitch);

     stage = "cudaMalloc (row pointer table)";
     err = cudaMalloc((void**)&da, kRows*sizeof(int*));
     if (err != cudaSuccess) break;

     stage = "cudaMemcpy (row pointer table)";
     err = cudaMemcpy(da, hRowPtrs, kRows*sizeof(int*), cudaMemcpyHostToDevice);
     if (err != cudaSuccess) break;

     printf("Before kernel\n");
     for(int i=0;i<kRows;i++)
        {
         for(int j=0;j<kCols;j++)
            printf("%d ",ha[i][j]);
         printf("\n");
        }
     printf("\n");

     /* One block of kRows x kCols threads: threadIdx.x = row, threadIdx.y = column. */
     add2<<<dim3(1,1,1), dim3(kRows,kCols,1)>>>(da);

     stage = "kernel launch";
     err = cudaGetLastError();
     if (err != cudaSuccess) break;

     /* Surfaces in-kernel faults and flushes the device-side printf output. */
     stage = "kernel execution";
     err = cudaDeviceSynchronize();
     if (err != cudaSuccess) break;

     stage = "cudaMemcpy2D (device to host)";
     err = cudaMemcpy2D(hb, kCols*sizeof(int), dData, pitch,
                        kCols*sizeof(int), kRows, cudaMemcpyDeviceToHost);
     if (err != cudaSuccess) break;

     printf("After kernel\n");
     for(int i=0;i<kRows;i++)
        {
         for(int j=0;j<kCols;j++)
            printf("%d ",hb[i][j]);
         printf("\n");
        }
    } while (0);

 if (err != cudaSuccess)
    fprintf(stderr, "%s failed: %s\n", stage, cudaGetErrorString(err));

 /* cudaFree(NULL) is a no-op, so this is safe on every exit path. */
 cudaFree(da);
 cudaFree(dData);

 return (err == cudaSuccess) ? 0 : 1;
}

Another approach to 2D arrays is to treat them as nothing more than an arrangement of elements in memory. 2D阵列的其他方法之一是,如果您认为它只是内存中元素的排列。 The following code demonstrates this approach using row-major order, written in a more general way and with proper error checking. 下面的代码解释了这种具有行主顺序的方法,以及通过适当的错误检查进行写入的更通用的方法。

 #include<stdio.h>

 #define NUM_ROWS 2
 #define NUM_COLS 2

 /*
  * Adds 2 to one element of a row-major matrix stored in a flat buffer.
  *
  * da      : pointer to the first element; indexed on the device as
  *           da[row * iNumCol + col].
  * iNumCol : number of columns, i.e. the distance in elements between the
  *           starts of consecutive rows.
  *
  * Indexing: threadIdx.x selects the row, threadIdx.y selects the column.
  * There is no bounds check and blockIdx is not used, so the launch must
  * supply exactly one thread per element.
  */
 __global__ void add2(int* da, int iNumCol)
{
    const int row = threadIdx.x;
    const int col = threadIdx.y;

    /* Row-major offset of element (row, col). */
    int* const cell = da + ((row * iNumCol) + col);
    *cell += 2;
}

/*
 * Host driver for the flat, row-major version: uploads a NUM_ROWS x NUM_COLS
 * zero matrix, launches add2 with one thread per element, copies the result
 * back and prints the matrix before and after the kernel.
 *
 * Returns 0 on success, 1 if any CUDA call fails. The device buffer is
 * released on every exit path after a successful allocation (the original
 * leaked it when the kernel launch failed).
 */
int main()
{
    int             ha[NUM_ROWS][NUM_COLS] ;
    int             *da = NULL ;
    int             hb[NUM_ROWS][NUM_COLS] ;
    /* Byte count of the whole matrix; size_t matches the CUDA API parameter type. */
    size_t          iSize = NUM_ROWS * NUM_COLS * sizeof(int) ;
    cudaError_t     cuError = cudaSuccess ;
    dim3            dimGrid (1,1,1) ;
    /* One block; threadIdx.x walks rows and threadIdx.y walks columns. */
    dim3            dimBlock (NUM_ROWS, NUM_COLS, 1) ;

    for(int i=0;i<NUM_ROWS;i++)
    {
            for(int j=0;j<NUM_COLS;j++)
            {
                    ha[i][j]=0;
            }
    }

    cuError = cudaMalloc((void**)&da, iSize) ;
    if (cudaSuccess != cuError)
    {
            printf ("Failed to allocate memory\n") ;
            return 1 ;
    }
    /* A host 2D array is contiguous, so a single linear copy moves the matrix. */
    cuError = cudaMemcpy(da, ha, iSize, cudaMemcpyHostToDevice);
    if (cudaSuccess != cuError)
    {
            cudaFree (da) ;
            printf ("Failed in Memcpy 1\n") ;
            return 1 ;
    }

    printf("Before kernel\n");
    for(int i=0;i<NUM_ROWS;i++)
    {
            for(int j=0;j<NUM_COLS;j++)
            {
                    printf("%d ",ha[i][j]);
            }
            printf("\n");
    }
    printf("\n");

    add2<<<dimGrid, dimBlock>>>(da, NUM_COLS);
    /* Launch-configuration errors are only reported through cudaGetLastError. */
    cuError = cudaGetLastError () ;
    if (cudaSuccess != cuError)
    {
            cudaFree (da) ;
            printf ("Failed in kernel launch and reason is %s\n", cudaGetErrorString(cuError)) ;
            return 1 ;
    }

    /* Wait for the kernel so an execution fault is not misreported as a copy failure. */
    cuError = cudaDeviceSynchronize () ;
    if (cudaSuccess != cuError)
    {
            cudaFree (da) ;
            printf ("Failed in kernel execution and reason is %s\n", cudaGetErrorString(cuError)) ;
            return 1 ;
    }

    cuError = cudaMemcpy(hb, da, iSize, cudaMemcpyDeviceToHost);
    if (cudaSuccess != cuError)
    {
            cudaFree (da) ;
            printf ("Failed in Memcpy 2\n") ;
            return 1 ;
    }

    printf("After kernel\n");
    for(int i=0;i<NUM_ROWS;i++)
    {
            for(int j=0;j<NUM_COLS;j++)
            {
                    printf("%d ",hb[i][j]);
            }
            printf("\n");
    }
    cudaFree (da) ;

    return 0;
}

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM