How to copy a 2D array from Device to Host in Cuda?

Question

I want to copy the results of a fft operation from device to host.

This is what happens. The input is a pointer to a pointer to an float. the values get allocated during runtime. then it is transferred to the gpu and fft is calculated. Then the results are transferred to to float2 2D array. But the result i get is wrong. It contains all zero. So how can I overcome this issue ?

#define NRANK 2
#define BATCH 10

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h> 

#include <iostream>
#include <vector>

using namespace std;


float func(int,int){
              return 2.0f;  // some value get return. I have put a dummy value here
} 
int main()
    { 

    const size_t NX = 4;
    const size_t NY = 5;

    // Input array - host side
    float **a = new float*[NX];  

    for (int r = 0; r < NX; ++r)  // this can be also done on GPU
        {
        a[r] = new float[NY];
        for (int c = 0; c < NY; ++c)
            {            
                a[r][c] = func(r,c);         
            }
        }

    // Output array - host side  
    float2 c[NX][NY] = { 0 };


    cufftHandle plan;
    cufftComplex *data;   // Input and output arrays - device side
    int n[NRANK] = {NX, NY};

    // Transfer the data from host to device - have to do it like this becase
    // the array is a dynamic array
    cudaMalloc((void**)&data, sizeof(cufftComplex)*NX*(NY/2+1));
    for(int i=0; i<NX; ++i){
        cudaMemcpy(reinterpret_cast<float*>(data) + i*NY, a[i], sizeof(float)*NY,     
             cudaMemcpyHostToDevice);
     }

    /* Create a 2D FFT plan. */
    cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_C2C,BATCH);
    cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
    cufftExecC2C(plan, data, data, CUFFT_FORWARD);
    cudaThreadSynchronize();
    cudaMemcpy(c, data, sizeof(float2)*NX*NY, cudaMemcpyDeviceToHost);

    // Print the values of c  ---- ALL ARE 0
    for (int i = 0; i < NX; i++)
        {
        for (int j =0 ; j< NY; j++)
            {
            printf(" %f + %fi ",c[i][j].x,c[i][j].y);
            b
            }
        printf("\n");
        }


    cufftDestroy(plan);
    cudaFree(data);

    return 0;
    }

How could I solve this problem ?

EDIT

After considering Robert Crovella's suggestion I modified the code as

// Output array - host side
float2 c[NX][NY + 2] ;

// New device side variable that will hold the result from the FFT size - twice as input {2 x NX*(NY/2 + 1)}
cufftComplex *data_out;
cudaMalloc((void**)&data_out, sizeof(cufftComplex)*NX*(NY+2));

 /* Create a 2D FFT plan. */
cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_C2C,BATCH);
cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
cufftExecC2C(plan, data, data_out, CUFFT_FORWARD);
cudaThreadSynchronize();
cudaError  cudaStat2 = cudaMemcpy(c, data_out, sizeof(cufftComplex)*NX*(NY+2) , cudaMemcpyDeviceToHost);

cout << cudaGetErrorString(cudaStat2) << " ,\n";

for (int i = 0; i < NX; i++)
    {
    for (int j =0 ; j< NY; j++)
        {
        printf(" %f ,",c[i][j].x);

        }
    printf("\n");
    }

Now the output device matrix is 2 x sizeof(cufftComplex) NX (NY/2+1) and I have declared it as data_out. Then the host side matrix was also adjusted to hold NX*(NY+2) elements of float2. Now I don't get any errors from cudaMemcpy. But still I don't get the answer. What i get is an array of 1.#QNAN0 values.

So how can I solve this ?

Answer 1

The problem described in your question title was fixed by making the modification I described in the comments. After that, your code had other issues, not related to the copy back of results.

You're asking for a C2C transform of size NX*NY , but your input data size is only sizeof(cufftComplex)*NX*(NY/2+1) . When I fix a variety of issues around your input data and it's size, I start to get results that are not NAN's in your code.

Also, it's not clear to me why you are allocating a size of (NY+2) in various places. When I fix those errors I can get some kind of (non-NAN) result from your code:

$ cat t311.cu
#define NRANK 2
#define BATCH 10

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>

#include <iostream>
#include <vector>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


using namespace std;


float func(int,int){
              return 2.0f;  // some value get return. I have put a dummy value here
}
int main()
    {

    const size_t NX = 4;
    const size_t NY = 5;

    // Input array - host side
    float **a = new float*[NX];

    for (int r = 0; r < NX; ++r)  // this can be also done on GPU
        {
        a[r] = new float[NY];
        for (int c = 0; c < NY; ++c)
            {
                a[r][c] = func(r,c);
            }
        }

    // Output array - host side
    float2 c[NX][NY] ;
    cufftHandle plan;

    cufftComplex *data;   // Input and output arrays - device side
    int n[NRANK] = {NX, NY};
    cudaMalloc((void**)&data, sizeof(cufftComplex)*NX*NY);
    cudaMemset(data,0, sizeof(cufftComplex)*NX*NY);
    for(int i=0; i<NX; ++i){
        cudaMemcpy(reinterpret_cast<float*>(data) + i*NY, a[i], sizeof(float)*NY,cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy H2D fail");
     }

  // New device side variable that will hold the result from the FFT size - twice as input {2 x NX*(NY/2 + 1)}
    cufftComplex *data_out;
    cudaMalloc((void**)&data_out, sizeof(cufftComplex)*NX*(NY));
    cudaCheckErrors("cudaMalloc data_out fail");
   /* Create a 2D FFT plan. */
    if ((cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_C2C,BATCH)) != CUFFT_SUCCESS) printf("cufft fail 1\n");
    if ((cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE)) != CUFFT_SUCCESS) printf("cufft fail 2\n");
    if ((cufftExecC2C(plan, data, data_out, CUFFT_FORWARD)) != CUFFT_SUCCESS) printf("cufft fail 3\n") ;
    cudaDeviceSynchronize();
    cudaMemcpy(c, data_out, sizeof(cufftComplex)*NX*(NY) , cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy D2H fail");

    for (int i = 0; i < NX; i++)
      {
      for (int j =0 ; j< NY; j++)
        {
        printf(" %f ,",c[i][j].x);

        }
      printf("\n");
      }

    cufftDestroy(plan);
    cudaFree(data);
    cudaCheckErrors("some error");
    return 0;
    }

$ nvcc -arch=sm_20 -o t311 t311.cu -lcufft
$ ./t311
 20.000000 , 0.000000 , 0.000000 , 0.000000 , 0.000000 ,
 20.000000 , 0.000000 , 0.000000 , 0.000000 , 0.000000 ,
 0.000000 , 0.000000 , 0.000000 , 0.000000 , 0.000000 ,
 0.000000 , 0.000000 , 0.000000 , 0.000000 , 0.000000 ,
$

I'm not saying this fixes every possible issue or error this code might have, but the first two issues you've identified have been addressed.

I think the remaining problems stem from how you are populating the input data. You are putting an uneven number (NY = 5) of float values, laid over the top of a cufftComplex array. To me, that would give strange results. The first two complex values in each row (of data) will have real and complex components of 2. The third value will have a real component of 2, and an imaginary component of 0. The last two complex values will be all zero.

If you want to see one possible method to copy an array of float values into the real parts of an array of complex values with a single API call, consider cudaMemcpy2D , documented here and with a recent example here . That example actually shows how to copy from a structure array to a float array, but doing the reverse ( float array to structure array) uses a similar technique. Something like this should work:

for(int i=0; i<NX; ++i){
    cudaMemcpy2D(data + i*NY, sizeof(cufftComplex), a[i], sizeof(float), sizeof(float), NY, cudaMemcpyHostToDevice);
 }

If you have new questions/new problems, please post a new SO question to describe them.

How to copy a 2D array from Device to Host in Cuda?

Question

EDIT

1 answers

solution1
1 ACCPTED 2013-12-28 16:00:16

How to copy a 2D array from Device to Host in Cuda?

Question

EDIT

1 answers

solution1 1 ACCPTED 2013-12-28 16:00:16

solution1
1 ACCPTED 2013-12-28 16:00:16