How to copy structured data allocated in device memory from device to host

Question

I'm new to GPU and CUDA programming. I am trying to copy structured data dynamically allocated on device from device to host. I modified a simple code from GPU programming guide. I don't get any error when compiling the code but the only thing I have that's problematic is the output is wrong ie '0'. Here's the code:

#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

typedef struct Point
{
    int2 pt;
};

#define NUMOFBLOCKS 1
#define THREDSPERBLOCK  16

__device__ Point* pnt[NUMOFBLOCKS];
Point dataptr_h[NUMOFBLOCKS][THREDSPERBLOCK];

__global__ void allocmem() 
{   
    if (threadIdx.x == 0)       
        pnt[blockIdx.x] = (Point*)malloc(1*blockDim.x * sizeof(Point)); 
    __syncthreads(); 
} 

__global__ void usemem() 
{ 
    Point* ptr = pnt[blockIdx.x]; 
    if (ptr != NULL) 
    {       
        ptr[threadIdx.x].pt.x = threadIdx.x; 
        ptr[threadIdx.x].pt.y = threadIdx.x;
        printf("Ptr = %d\t", ptr[threadIdx.x].pt.x);
    }
}

__global__ void freemem() 
{ 
    Point* ptr = pnt[blockIdx.x]; 
    if (ptr != NULL) 
        printf("Block %d, Thread %d: final value = %d\n", blockIdx.x, threadIdx.x, ptr[threadIdx.x]); 
    if (threadIdx.x == 0) 
        free(ptr); 
}


int main()
{
    Point* d_pt[NUMOFBLOCKS];
    for (int i = 0 ; i < NUMOFBLOCKS; i++)
        cudaMalloc(&d_pt[i], sizeof(Point)*16);  

    // Allocate memory  
    allocmem<<< NUMOFBLOCKS, THREDSPERBLOCK >>>();  
    // Use memory 
    usemem<<< NUMOFBLOCKS, THREDSPERBLOCK >>>(); 
    cudaMemcpyFromSymbol(d_pt, pnt, sizeof(d_pt));
    cudaMemcpy(dataptr_h, d_pt, sizeof(dataptr_h), cudaMemcpyDeviceToHost);

    for (int j = 0 ; j < 1; j++)
        for (int i = 0 ; i < 16; i++)
        {
            printf("\nPtr_h(%d,%d)->X = %d\t", j, i, dataptr_h[j][i].pt.x);
            printf("Ptr_h(%d,%d)->Y = %d", j, i, dataptr_h[j][i].pt.y);
        }

    freemem<<< NUMOFBLOCKS, THREDSPERBLOCK >>>();
    cudaDeviceSynchronize();
    return 0;
}

The output of the code is:

Ptr_h(0,0)->X = 0       Ptr_h(0,0)->Y = 0
Ptr_h(0,1)->X = 0       Ptr_h(0,1)->Y = 0
Ptr_h(0,2)->X = 0       Ptr_h(0,2)->Y = 0
Ptr_h(0,3)->X = 0       Ptr_h(0,3)->Y = 0
Ptr_h(0,4)->X = 0       Ptr_h(0,4)->Y = 0
Ptr_h(0,5)->X = 0       Ptr_h(0,5)->Y = 0
Ptr_h(0,6)->X = 0       Ptr_h(0,6)->Y = 0
Ptr_h(0,7)->X = 0       Ptr_h(0,7)->Y = 0
Ptr_h(0,8)->X = 0       Ptr_h(0,8)->Y = 0
Ptr_h(0,9)->X = 0       Ptr_h(0,9)->Y = 0
Ptr_h(0,10)->X = 0      Ptr_h(0,10)->Y = 0
Ptr_h(0,11)->X = 0      Ptr_h(0,11)->Y = 0
Ptr_h(0,12)->X = 0      Ptr_h(0,12)->Y = 0
Ptr_h(0,13)->X = 0      Ptr_h(0,13)->Y = 0
Ptr_h(0,14)->X = 0      Ptr_h(0,14)->Y = 0
Ptr_h(0,15)->X = 0      Ptr_h(0,15)->Y = 0

What can I do to fix this?

Answer 1

You cannot use a pointer created by a device malloc operation with the CUDA runtime API (ie cudaMemcpy )

So this line of code is problematic:

cudaMemcpy(dataptr_h, d_pt, sizeof(dataptr_h), cudaMemcpyDeviceToHost);

d_pt contains pointers picked up from pnt . And pnt had it's value(s) set by device malloc .

Instead you'll need to create areas properly allocated with cudaMalloc , then copy the data you want to those areas first (from one area on the device to another area), then copy to the host using cudaMemcpy .

Before I go farther to explain your next objection, let's be clear that the above is your intent (to use a pointer created in a device malloc operation as one of the targets of a cudaMemcpy ). This is not legal.

"But I used cudaMalloc ??"

d_pt is an array of pointers that live in host memory. You took each one of those pointers and assigned it a value (a pointed-to location in device memory) using cudaMalloc .

Then this line of code:

cudaMemcpyFromSymbol(d_pt, pnt, sizeof(d_pt));

over-wrote all those pointers that you set up with pointers obtained from elsewhere in device memory, specifically pointers assigned by device malloc . While this is technically legal (that line of code does not throw an error) those pointers are useless on the host (for use with the runtime API, anyway).

How to copy structured data allocated in device memory from device to host

Question

1 answers

solution1
1 ACCPTED 2013-10-22 02:44:33

How to copy structured data allocated in device memory from device to host

Question

1 answers

solution1 1 ACCPTED 2013-10-22 02:44:33

solution1
1 ACCPTED 2013-10-22 02:44:33