Matrix-vector multiplication (cublasDgemv) returns zero

Question

For my first venture into CUDA/cuBLAS, I'm trying to write a simple function that multiplies an MxN matrix (represented as a vector of vectors, std::vector<std::vector<double>>) by an Nx1 "ones" vector, so as to get the row-wise sums of the matrix. This makes use of cublasDgemv() plus other basic CUDA operations, which I see as a good place to start.

After dealing with setup issues and reading/copying sample codes, here's what I have:

// Intended to return the row sums of `in` by multiplying the matrix with a
// vector of ones on the GPU through cublasDgemv.
//
// NOTE(review): the cudaStat / stat values assigned below are never inspected,
// so a failing CUDA or cuBLAS call goes unnoticed.
std::vector<double> test(std::vector<std::vector<double>> in)
{
    std::vector<double> out;
    // in_m = number of rows, in_n = length of the first row.
    // in[0] requires `in` to be non-empty.
    long in_m = in.size();
    long in_n = in[0].size();
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;
    // Flatten the vector-of-vectors into a single col-first (column-major) array.
    double* p_in = vec2d_to_colfirst_array(in);
    // Host buffers: the vector of ones (in_n entries) and the result (in_m entries).
    double* p_ones = new double[in_n];
    double* p_out = new double[in_m];
    std::fill(p_ones, p_ones + in_n, 1.0);
    // Device counterparts of the three host buffers.
    double* dev_in;
    double* dev_ones;
    double* dev_out;
    cudaStat = cudaMalloc((void**)&dev_in, in_m * in_n * sizeof(double));
    cudaStat = cudaMalloc((void**)&dev_ones, in_n * sizeof(double));
    cudaStat = cudaMalloc((void**)&dev_out, in_m * sizeof(double));
    stat = cublasCreate(&handle);
    // Upload the matrix and the ones vector; dev_out is not initialised.
    cudaStat = cudaMemcpy(dev_in, p_in, in_m*in_n * sizeof(double), cudaMemcpyHostToDevice);
    cudaStat = cudaMemcpy(dev_ones, p_ones, in_n * sizeof(double), cudaMemcpyHostToDevice);
    // gemv computes y = alpha * A * x + beta * y; beta = 0 means y is overwritten.
    double alpha = 1.0;
    double beta = 0.0;
    // A = dev_in (in_m x in_n, leading dimension in_m), x = dev_ones.
    // NOTE(review): the result vector argument (second-to-last) is dev_ones, not
    // dev_out, so dev_out is never written before it is copied back below.
    stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_ones, 1);
    // Copy in_m doubles from dev_out back to the host and hand them to `out`.
    cudaStat = cudaMemcpy(p_out, dev_out, in_m * sizeof(double), cudaMemcpyDeviceToHost);
    out.assign(p_out, p_out + in_m);
    // Release device memory and the cuBLAS handle.
    cudaFree(dev_in);
    cudaFree(dev_ones);
    cudaFree(dev_out);
    cublasDestroy(handle);
    // NOTE(review): p_ones and p_out were allocated with new[], which free() does
    // not match. How p_in was allocated is not visible here - confirm against
    // vec2d_to_colfirst_array.
    free(p_in);
    free(p_ones);
    free(p_out);
    return out;
}

It doesn't look much different from the sample I read, so I expected it to "just work". However, when I inspected p_out, it was all zeros, even though I certainly didn't pass in a zero matrix.

I verified that vec2d_to_colfirst_array() does its job just fine, and also that dev_in / dev_ones are properly populated, by copying the data from the device back to the host and reading it. Maybe the problem is in the call to cublasDgemv(), but since I'm new (and since the BLAS calling convention is much less intuitive than that of, e.g., Eigen), after much frustration I just can't see what's wrong.

Any help appreciated!

Answer 1

The error appears to be fairly simple. You are expecting to copy the results from dev_out :

cudaStat = cudaMemcpy(p_out, dev_out, in_m * sizeof(double), cudaMemcpyDeviceToHost);

but you never use dev_out in your cublas call:

stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_ones, 1);

This appears to be just a copy-paste error. If you replace the last instance of dev_ones in your cublas call with dev_out, your code works for me:

stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_out, 1);

Here is a fully worked example with that change:

$ cat t315.cu
#include <vector>
#include <cublas_v2.h>
#include <iostream>

// Size of the demo matrix built in main(): idim1 rows, each holding idim2 values.
const long idim1 = 8;
const long idim2 = 8;

// Flattens a rectangular vector-of-vectors into a newly allocated column-major
// ("col-first") array, the layout cuBLAS expects for its matrix arguments.
//
// in[i] is row i of the matrix. Element (i, j) is stored at res[j * rows + i],
// so the leading dimension of the result equals the number of rows.
//
// Precondition: every row has the same length as in[0].
// Returns: an array of rows * cols doubles allocated with new[]; the caller
//          owns it and must release it with delete[]. An empty input yields a
//          zero-length allocation.
double* vec2d_to_colfirst_array(std::vector<std::vector<double>> in){
    const long dim1 = static_cast<long>(in.size());
    // Only look at in[0] when it exists; indexing an empty vector is undefined.
    const long dim2 = (dim1 > 0) ? static_cast<long>(in[0].size()) : 0;
    double *res = new double[dim1*dim2];
    for (long i = 0; i < dim1; i++)
      for (long j = 0; j < dim2; j++)
        // Column j occupies the contiguous range [j*dim1, (j+1)*dim1).
        res[j*dim1 + i] = in[i][j];
    return res;
}


// Computes the row sums of `in` on the GPU as y = A * ones using cublasDgemv.
//
// `in` is an M x N matrix given as M rows of N values (every row must have the
// same length as in[0]). It is flattened by vec2d_to_colfirst_array() and the
// result is handed to cuBLAS as a column-major M x N matrix with leading
// dimension M.
//
// Returns a vector of M sums. An empty matrix yields an empty vector; a matrix
// with zero columns yields M zeros. If a CUDA or cuBLAS call fails, a
// diagnostic is written to std::cerr and an empty vector is returned.
std::vector<double> test(std::vector<std::vector<double>> in)
{
    std::vector<double> out;
    if (in.empty()) return out;            // in[0] below would be undefined
    const long in_m = in.size();
    const long in_n = in[0].size();
    if (in_n == 0) {                       // sum over zero columns is zero
        out.assign(in_m, 0.0);
        return out;
    }

    // cublasDgemv takes int dimensions; refuse sizes that would be truncated.
    const int m = static_cast<int>(in_m);
    const int n = static_cast<int>(in_n);
    if (m != in_m || n != in_n) {
        std::cerr << "test: matrix dimensions exceed the int range of cublasDgemv" << std::endl;
        return out;
    }

    const std::size_t mat_bytes = static_cast<std::size_t>(in_m) * in_n * sizeof(double);
    const std::size_t ones_bytes = static_cast<std::size_t>(in_n) * sizeof(double);
    const std::size_t out_bytes = static_cast<std::size_t>(in_m) * sizeof(double);

    // Host buffers are vectors so they release themselves; allocate them before
    // the raw array so a std::bad_alloc here cannot leak p_in.
    const std::vector<double> ones(in_n, 1.0);
    out.resize(in_m);
    double* p_in = vec2d_to_colfirst_array(in);

    double* dev_in = nullptr;
    double* dev_ones = nullptr;
    double* dev_out = nullptr;
    cublasHandle_t handle;
    bool have_handle = false;
    cudaError_t cudaStat = cudaSuccess;
    cublasStatus_t stat = CUBLAS_STATUS_SUCCESS;
    const char* failed = nullptr;          // name of the first call that failed
    const double alpha = 1.0;
    const double beta = 0.0;               // y is overwritten, not accumulated

    // Single-pass block: `break` on the first failure and fall through to the
    // shared cleanup below.
    do {
        cudaStat = cudaMalloc((void**)&dev_in, mat_bytes);
        if (cudaStat != cudaSuccess) { failed = "cudaMalloc(dev_in)"; break; }
        cudaStat = cudaMalloc((void**)&dev_ones, ones_bytes);
        if (cudaStat != cudaSuccess) { failed = "cudaMalloc(dev_ones)"; break; }
        cudaStat = cudaMalloc((void**)&dev_out, out_bytes);
        if (cudaStat != cudaSuccess) { failed = "cudaMalloc(dev_out)"; break; }

        stat = cublasCreate(&handle);
        if (stat != CUBLAS_STATUS_SUCCESS) { failed = "cublasCreate"; break; }
        have_handle = true;

        cudaStat = cudaMemcpy(dev_in, p_in, mat_bytes, cudaMemcpyHostToDevice);
        if (cudaStat != cudaSuccess) { failed = "cudaMemcpy(dev_in)"; break; }
        cudaStat = cudaMemcpy(dev_ones, ones.data(), ones_bytes, cudaMemcpyHostToDevice);
        if (cudaStat != cudaSuccess) { failed = "cudaMemcpy(dev_ones)"; break; }

        // dev_out = alpha * dev_in * dev_ones + beta * dev_out
        stat = cublasDgemv(handle, CUBLAS_OP_N, m, n, &alpha, dev_in, m, dev_ones, 1, &beta, dev_out, 1);
        if (stat != CUBLAS_STATUS_SUCCESS) { failed = "cublasDgemv"; break; }

        // Copy the M sums straight into the returned vector.
        cudaStat = cudaMemcpy(out.data(), dev_out, out_bytes, cudaMemcpyDeviceToHost);
        if (cudaStat != cudaSuccess) { failed = "cudaMemcpy(dev_out)"; break; }
    } while (false);

    // cudaFree on a null pointer performs no operation, so this is safe even
    // when an allocation above was never reached.
    cudaFree(dev_in);
    cudaFree(dev_ones);
    cudaFree(dev_out);
    if (have_handle) cublasDestroy(handle);

    // p_in was allocated with new[] by vec2d_to_colfirst_array, so it must be
    // released with delete[] (free() on it is undefined behaviour).
    delete[] p_in;

    if (failed) {
        std::cerr << "test: " << failed << " failed";
        if (cudaStat != cudaSuccess)
            std::cerr << ": " << cudaGetErrorString(cudaStat);
        else
            std::cerr << " (cublasStatus_t " << static_cast<int>(stat) << ")";
        std::cerr << std::endl;
        out.clear();
    }
    return out;
}

// Demo driver: builds an idim1 x idim2 matrix filled with 1.0, runs test() on
// it and prints each returned value followed by a comma, then a newline.
int main(){

  const std::vector<double> ones_row(idim2, 1.0);
  // idim1 copies of the same row.
  std::vector<std::vector<double>> matrix(idim1, ones_row);
  const std::vector<double> sums = test(matrix);
  for (double s : sums) std::cout << s << ",";
  std::cout << std::endl;
}

$ nvcc -std=c++11 -o t315 t315.cu -lcublas
t315.cu(24): warning: variable "cudaStat" was set but never used

t315.cu(25): warning: variable "stat" was set but never used

$ cuda-memcheck ./t315
========= CUDA-MEMCHECK
8,8,8,8,8,8,8,8,
========= ERROR SUMMARY: 0 errors
$

Note that free() is not the correct API to pair with new[] (memory obtained from new[] should be released with delete[]), but that doesn't seem to be the crux of your question or issue.

Matrix-vector multiplication (cublasDgemv) returns zero

Question

1 answer

solution1
2 ACCEPTED 2018-11-01 13:54:04

Matrix-vector multiplication (cublasDgemv) returns zero

Question

1 answer

solution1 2 ACCEPTED 2018-11-01 13:54:04

solution1
2 ACCEPTED 2018-11-01 13:54:04