Looping over 3 dimensional arrays in CUDA to sum their elements

Question

I'm having some problems understanding how to loop over 3 dimensional arrays with a kernel.

This is the code I have so far:

#include <cstdlib>
#include <ctime>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

using namespace std;


// Stops the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        cerr << what << " failed: " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

// Copies one tightly packed host volume of floats into a pitched device
// allocation of the given extent.
//   dst    - device allocation obtained from cudaMalloc3D
//   src    - host volume laid out as [depth][height][width]
//   width  - number of floats per row
//   height - number of rows per slice
//   extent - extent used for the device allocation (width given in bytes)
static void uploadVolume(const cudaPitchedPtr& dst, void* src,
                         int width, int height, const cudaExtent& extent)
{
    // cudaMemcpy3D requires every field that is not used (srcArray, dstArray,
    // srcPos, dstPos) to be zero, so the struct must be zero-initialized.
    cudaMemcpy3DParms p = {0};
    // Host rows are not padded: pitch is exactly one row of floats, and the
    // last two arguments are the logical x/y sizes (width, height).
    p.srcPtr = make_cudaPitchedPtr(src, width * sizeof(float), width, height);
    p.dstPtr = dst;
    p.extent = extent;
    p.kind = cudaMemcpyHostToDevice;
    checkCuda(cudaMemcpy3D(&p), "cudaMemcpy3D");
}

// Fills two host volumes with random values, allocates three pitched device
// volumes and uploads h_A, h_B and the zeroed h_C to d_A, d_B and d_C.
int main()
{
    // Array properties
    const int width = 1;
    const int height = 1;
    const int depth = 1;

    // Declaration of arrays. The fastest-varying (last) index must be the
    // width, because cudaExtent/cudaPitchedPtr treat x (width) as the
    // contiguous dimension, then y (height), then z (depth).
    float h_A[depth][height][width];
    float h_B[depth][height][width];
    float h_C[depth][height][width] = {{{0}}};

    // Fill up arrays
    srand(static_cast<unsigned int>(time(0)));
    for (int z = 0; z < depth; z++) {
        for (int j = 0; j < height; j++) {
            for (int i = 0; i < width; i++) {
                h_A[z][j][i] = rand() % 1000;
                h_B[z][j][i] = rand() % 1000;
            }
        }
    }

    // Declaration of device pointers
    cudaPitchedPtr d_A, d_B, d_C;

    // Allocating memory in GPU (extent width is expressed in bytes)
    cudaExtent extent = make_cudaExtent(width * sizeof(float), height, depth);
    checkCuda(cudaMalloc3D(&d_A, extent), "cudaMalloc3D(d_A)");
    checkCuda(cudaMalloc3D(&d_B, extent), "cudaMalloc3D(d_B)");
    checkCuda(cudaMalloc3D(&d_C, extent), "cudaMalloc3D(d_C)");

    // Copying memory from host to device: each host array goes to its own
    // device allocation.
    uploadVolume(d_A, h_A, width, height, extent);
    uploadVolume(d_B, h_B, width, height, extent);
    uploadVolume(d_C, h_C, width, height, extent);

    // Release device memory
    checkCuda(cudaFree(d_A.ptr), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B.ptr), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C.ptr), "cudaFree(d_C)");

    system("pause");
    return 0;
}

How do I make a kernel that loops over each element in the arrays and adds them together?

Answer 1

There is an example on page 21 of the CUDA 4.0 programming guide for looping over a 2D array of floats:

// Host code
// Allocate a pitched 64x64 float array on the device and launch MyKernel
// on it with 100 blocks of 512 threads.
int width = 64;
int height = 64;
size_t pitch;   // row stride in bytes, filled in by cudaMallocPitch
float* devPtr;
cudaMallocPitch(&devPtr, &pitch, width * sizeof(float), height);

dim3 grid(100);
dim3 block(512);
MyKernel<<<grid, block>>>(devPtr, pitch, width, height);


// Device code
// Walks every element of a pitched 2D float array, one row at a time.
//   devPtr - base address of the pitched allocation
//   pitch  - row stride in bytes (hence the char* arithmetic below)
//   width  - number of floats per row
//   height - number of rows
// The thread and block indices are not used, so every launched thread
// performs the same full traversal; the loaded value is not stored anywhere.
__global__ void MyKernel(float* devPtr, size_t pitch, int width, int height)
{
    char* base = (char*)devPtr;
    for (int y = 0; y < height; ++y) {
        // Rows are pitch bytes apart, which may be more than width floats.
        float* row = (float*)(base + y * pitch);
        for (int x = 0; x < width; ++x) {
            float element = row[x];
        }
    }
}

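Rewriting it to sum the elements should be easy. Additionally, you can refer to this thread. For a 3D allocation made with cudaMalloc3D, the same addressing extends by one level: slice z starts at (char*)ptr + z * pitch * height, and row r within that slice starts at slice + r * pitch. When efficiency is a concern, you might also look at the parallel reduction approach in CUDA. This is used, for example, when implementing Monte Carlo simulation (see the Multi Monte Carlo example).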

Looping over 3 dimensional arrays in CUDA to sum their elements

Question

1 answer

solution1
2 ACCEPTED 2013-09-15 18:39:23

Looping over 3 dimensional arrays in CUDA to sum their elements

Question

1 answer

solution1 2 ACCEPTED 2013-09-15 18:39:23

solution1
2 ACCEPTED 2013-09-15 18:39:23