
Pass 2D thrust::device_vector Complex Matrix to CUDA kernel function

I'm new to CUDA and I'm trying to move my existing project to the GPU using CUDA. My code is based on complex matrices and complex buffers.

As a first step, I tried to move this nested for-loop code to CUDA (the rest will be similar):

typedef thrust::complex<double> smp_t;

uint8_t *binbuffer = (uint8_t*) malloc(8 * bufsize * sizeof(uint8_t));
smp_t *sgbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
smp_t *cnbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));

// Create matrix.
thrust::complex<double> i_unit(0.0, 1.0);
thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);

// Fill the matrix.
for (size_t row = 0; row < 8; row++) {
    for (size_t col = 0; col < 8; col++) {
        std::complex<double> tmp =
                exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)8);
        tw[row].push_back(tmp);
    }
}

/* The Code To Move to the GPU processing */
for (unsigned int i = 0; i < bufsize; i++) {
        for (size_t ch = 0; ch < 8; ch++)
                for (size_t k = 0; k < 8; k++)
                        cnbuf[ch*bufsize + i] += sgbuf[k*bufsize+i] * tw[ch].at(k);
}

This is the code from the .cu file that will replace the current nested for-loop:

__global__ void kernel_func(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
    unsigned int ch = threadIdx.x;
    unsigned int k = blockIdx.x;

    for (int x = 0; x < block_size; ++x) {
        unsigned int sig_index = k*block_size+x;
        unsigned int tw_index = ch*k;
        unsigned int cn_index = ch*block_size+x;

        cuDoubleComplex temp = cuCmul(sgbuf[sig_index], make_cuDoubleComplex(tw[tw_index].real(), tw[tw_index].imag()));
        cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
    }
}

void kernel_wrap(
            smp_t *cnbuf,
            smp_t *sgbuf,
            thrust::host_vector<thrust::host_vector<smp_t>>tw,
            size_t buffer_size) {
    smp_t *d_sgbuf;
    smp_t *d_cnbuf;
    thrust::device_vector<smp_t> d_tw(8*8);
    thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());

    cudaMalloc((void **)&d_sgbuf, buffer_size);
    cudaMalloc((void **)&d_cnbuf, buffer_size);

    cudaMemcpy(d_sgbuf, sgbuf, buffer_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(d_cnbuf, cnbuf, buffer_size, cudaMemcpyDeviceToHost);

    thrust::raw_pointer_cast(d_tw.data());

    kernel_func<<<8, 8>>>(
            reinterpret_cast<cuDoubleComplex*>(d_cnbuf),
            reinterpret_cast<cuDoubleComplex*>(d_sgbuf),
            thrust::raw_pointer_cast(d_tw.data()),
            buffer_size
    );

    cudaError_t varCudaError1 = cudaGetLastError();
    if (varCudaError1 != cudaSuccess)
    {
            std::cout << "Failed to launch subDelimiterExamine kernel (error code: " << cudaGetErrorString(varCudaError1) << ")!" << std::endl;
            exit(EXIT_FAILURE);
    }

    cudaMemcpy(sgbuf, d_sgbuf, buffer_size, cudaMemcpyHostToDevice);
    cudaMemcpy(cnbuf, d_cnbuf, buffer_size, cudaMemcpyHostToDevice);

}

When I run the code, I get the error:

Failed to launch subDelimiterExamine kernel (error code: invalid argument)!

I think the argument causing the trouble is 'd_tw'. So my questions are:

  1. What am I doing wrong with the cast from thrust::host_vector<thrust::host_vector<smp_t>> to thrust::device_vector<smp_t> (from a 2D matrix to one flattened array)?
  2. Is there a better way to work with 2D complex numbers in CUDA?
  3. The documentation about complex arrays in CUDA is very poor; where can I read about working with CUDA complex matrices?

Thanks!!!!

There were various problems. I will list a few, and probably miss some. So please refer to the example code I have given for additional differences.

  1. The most immediate problem is here:

     thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());

This is what is giving rise to the invalid argument error you are seeing. Underneath the hood, thrust is going to try to use a cudaMemcpyAsync operation for this, because this is inherently a copy from host to device. We will fix this by replacing it with an ordinary cudaMemcpy operation, but to understand how to construct that, it's necessary to understand item 2.

  2. You seem to think that a vector of vectors implies contiguous storage. It does not, and that statement is not specific to thrust. Since a thrust::host_vector of vectors (or even a std::vector of vectors) does not imply contiguous storage, we can't easily construct a single operation, such as cudaMemcpy or thrust::copy, to copy this data. Therefore it will be necessary to explicitly flatten it (see the sketch after this list).

  3. Your copy directions on the cudaMemcpy operations are universally backward. Where you should have had cudaMemcpyHostToDevice you had cudaMemcpyDeviceToHost, and vice versa.

  4. The CUDA cuComplex.h header file predates thrust, and was provided as a quick C-style way to work with complex numbers. There is no documentation for it - you have to read the file itself and work out how to use it, as you seem to have already done. However, since you are using thrust::complex<> anyway, it's far simpler just to use that coding paradigm, and write your device code to look almost exactly like your host code.

  5. You had various transfer sizes wrong. cudaMemcpy takes a size in bytes to transfer.
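
To make items 1, 2, 3 and 5 concrete, here is a minimal sketch (assuming the same 8x8 tw matrix and smp_t typedef as in your question; the helper name upload_tw is purely illustrative, not part of any library). It flattens the vector-of-vectors on the host and then does a single host-to-device copy whose size is given in bytes:

    #include <thrust/complex.h>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <cuda_runtime.h>

    typedef thrust::complex<double> smp_t;

    // Hypothetical helper: flatten the 8x8 host-side vector-of-vectors and copy it
    // into an already-sized device_vector with one contiguous transfer.
    void upload_tw(const thrust::host_vector<thrust::host_vector<smp_t>> &tw,
                   thrust::device_vector<smp_t> &d_tw)   // caller sizes d_tw to 8*8
    {
        // Item 2: the inner vectors are separate allocations, so copy them
        // element by element into one contiguous host buffer first.
        thrust::host_vector<smp_t> flat(8 * 8);
        for (size_t row = 0; row < 8; row++)
            for (size_t col = 0; col < 8; col++)
                flat[row * 8 + col] = tw[row][col];

        // Items 1, 3, 5: one ordinary cudaMemcpy, host-to-device, size in bytes.
        cudaMemcpy(thrust::raw_pointer_cast(d_tw.data()), &flat[0],
                   8 * 8 * sizeof(smp_t), cudaMemcpyHostToDevice);
    }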

What follows is an example, cobbled together from the pieces you have shown, with a variety of "fixes". I'm not claiming it's in any way perfect or correct, but it avoids the issues I have outlined above. Furthermore, depending on whether you compile with or without a -DUSE_KERNEL define, it will either run your "original" host code and display the output, or run the kernel code and display the output. According to my testing, the outputs match.

$ cat t1751.cu
#include <thrust/complex.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
#include <cstdint>
#include <cuComplex.h>

typedef thrust::complex<double> smp_t;
__global__ void kernel_func_old(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
    unsigned int ch = threadIdx.x;
    unsigned int k = blockIdx.x;

     for (int x = 0; x < block_size; ++x) {
            unsigned int sig_index = k*block_size+x;
            unsigned int tw_index = ch*k;
            unsigned int cn_index = ch*block_size+x;


            cuDoubleComplex temp = cuCmul(sgbuf[sig_index], make_cuDoubleComplex(tw[tw_index].real(), tw[tw_index].imag()));
            cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
     }
}
__global__ void kernel_func(smp_t *cnbuf, smp_t *sgbuf, smp_t *tw, size_t block_size) {
    unsigned row = blockIdx.x;
    unsigned col = threadIdx.x;
    unsigned idx = row*block_size+col;
    for (int k = 0; k < 8; k++)
      cnbuf[idx] += sgbuf[k*block_size+col] * tw[row*block_size+k];
}

void kernel_wrap(
            smp_t *cnbuf,
            smp_t *sgbuf,
            thrust::host_vector<thrust::host_vector<smp_t>>tw,
            size_t buffer_size) {
    smp_t *d_sgbuf;
    smp_t *d_cnbuf;
    thrust::device_vector<smp_t> d_tw(8*8);
//    thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
    thrust::host_vector<smp_t> htw(buffer_size*buffer_size);
    for (int i = 0; i < buffer_size; i++)
      for (int j = 0; j < buffer_size; j++)
        htw[i*buffer_size + j] = tw[i][j];

    cudaMemcpy(thrust::raw_pointer_cast(d_tw.data()), &htw[0], 8*8*sizeof(smp_t), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&d_sgbuf, buffer_size*buffer_size*sizeof(smp_t));
    cudaMalloc((void **)&d_cnbuf, buffer_size*buffer_size*sizeof(smp_t));

    cudaMemcpy(d_sgbuf, sgbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyHostToDevice);
    cudaMemcpy(d_cnbuf, cnbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyHostToDevice);

    thrust::raw_pointer_cast(d_tw.data());

    kernel_func<<<8, 8>>>(d_cnbuf,d_sgbuf,thrust::raw_pointer_cast(d_tw.data()),buffer_size);

    cudaError_t varCudaError1 = cudaGetLastError();
    if (varCudaError1 != cudaSuccess)
    {
            std::cout << "Failed to launch subDelimiterExamine kernel (error code: " << cudaGetErrorString(varCudaError1) << ")!" << std::endl;
            exit(EXIT_FAILURE);
    }

//    cudaMemcpy(sgbuf, d_sgbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyDeviceToHost);
    cudaMemcpy(cnbuf, d_cnbuf, buffer_size*buffer_size*sizeof(smp_t), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 8; i++)
      for (int j = 0; j < 8; j++)
        std::cout << cnbuf[i*8+j].real() << "," << cnbuf[i*8+j].imag() << std::endl;
}

int main(){
  const int bufsize = 8;
  const int decfactor = 8;

  uint8_t *binbuffer = (uint8_t*) malloc(8 * bufsize * sizeof(uint8_t));
  smp_t *sgbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
  smp_t *cnbuf = (smp_t*) malloc(8 * bufsize * sizeof(smp_t));
  memset(cnbuf, 0, 8*bufsize*sizeof(smp_t));
 // Create matrix.
 thrust::complex<double> i_unit(0.0, 1.0);
#ifndef USE_KERNEL
 std::vector<std::vector<smp_t> > tw(decfactor);
#else
 thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);
#endif

  // Fill the Matrix
  for (size_t row = 0; row < 8; row++) {
       for (size_t col = 0; col < 8; col++) {
              std::complex<double> tmp = exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)8);
              tw[row].push_back(tmp);
      }
  }
  thrust::complex<double> test(1.0, 1.0);
  for (int i = 0; i < 8*8; i++) sgbuf[i]  = test;
#ifndef USE_KERNEL
/* The Code To Move to the GPU processing */
for (unsigned int i = 0; i < bufsize; i++) {
        for (size_t ch = 0; ch < 8; ch++)
                for (size_t k = 0; k < 8; k++)
                        cnbuf[ch*bufsize + i] += sgbuf[k*bufsize+i] * tw[ch].at(k);
}
    for (int i = 0; i < 8; i++)
      for (int j = 0; j < 8; j++)
        std::cout << cnbuf[i*8+j].real() << "," << cnbuf[i*8+j].imag() << std::endl;
#else

  kernel_wrap(cnbuf,sgbuf,tw,bufsize);
#endif

}
$ nvcc -o t1751 t1751.cu -std=c++11
$ ./t1751 >out_host.txt
$ nvcc -o t1751 t1751.cu -std=c++11 -DUSE_KERNEL
$ ./t1751 >out_device.txt
$ diff out_host.txt out_device.txt
$

Remember, this is mostly your code; I am not claiming it is correct, defect-free, or suitable for any particular purpose. Use it at your own risk.
