
Real scaled sparse matrix-vector multiplication in CUSP?

In CUSP, there is a multiply routine for SpMV (sparse matrix-vector multiplication) that takes an initialize, a combine, and a reduce functor:

 template <typename LinearOperator,
             typename MatrixOrVector1,
             typename MatrixOrVector2,
             typename UnaryFunction,
             typename BinaryFunction1,
             typename BinaryFunction2>
    void multiply(const LinearOperator&  A,
                  const MatrixOrVector1& B,
                  MatrixOrVector2& C,
                  UnaryFunction  initialize,
                  BinaryFunction1 combine,
                  BinaryFunction2 reduce);

From the interface it seems like a custom combine and reduce should be possible for any matrix-vector multiplication. I think CUSP supports using combine and reduce functions defined in thrust/functional.h other than multiplication and plus to calculate SpMV. For example, can I use thrust::plus to replace the original combine function (i.e. multiplication)? And I guess this scaled SpMV would also support sparse matrices in COO, CSR, DIA, and HYB formats.
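For reference, this is how I understand the ordinary y = A * x maps onto this interface. It is only a sketch of my reading of the signature, not something taken from the CUSP documentation, and the poisson5pt matrix is just a stand-in for any small test matrix:

#include <cusp/coo_matrix.h>
#include <cusp/array1d.h>
#include <cusp/functional.h>
#include <cusp/gallery/poisson.h>
#include <cusp/multiply.h>
#include <cusp/print.h>
#include <thrust/functional.h>

int main(void)
{
    // any small host matrix will do; poisson5pt is only a placeholder
    cusp::coo_matrix<int, int, cusp::host_memory> A;
    cusp::gallery::poisson5pt(A, 3, 3);

    cusp::array1d<int, cusp::host_memory> x(A.num_cols, 1);
    cusp::array1d<int, cusp::host_memory> y(A.num_rows, 0);

    cusp::constant_functor<int> initialize; // I assume the default seeds each y[i] with 0
    thrust::multiplies<int>     combine;    // pair A(i,j) with x[j]
    thrust::plus<int>           reduce;     // accumulate the terms of each row
    cusp::multiply(A, x, y, initialize, combine, reduce);

    cusp::print(y);                         // should match the plain cusp::multiply(A, x, y)
    return 0;
}

If that reading is right, swapping thrust::plus in for the combine functor should be an equally valid call.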

However, I got a wrong answer when I tested the example below (a.cu), where matrix A is in COO format and the combine operator is thrust::plus. I compiled it with: nvcc a.cu -o a

#include <cusp/coo_matrix.h>    // coo_matrix_view
#include <cusp/functional.h>    // cusp::constant_functor
#include <cusp/multiply.h>
#include <cusp/print.h>
#include <thrust/device_ptr.h>  // thrust::device_ptr
#include <thrust/functional.h>  // thrust::plus

int main(void)
{
    // COO format in host memory
    int   host_I[13] = {0,0,1,1,2,2,2,3,3,3,4,5,5}; // COO row indices
    int   host_J[13] = {0,1,1,2,2,4,6,3,4,5,5,5,6}; // COO column indices
    int   host_V[13] = {1,1,1,1,1,1,1,1,1,1,1,1,1};
    // x and y arrays in host memory
    int host_x[7] = {1,1,1,1,1,1,1};
    int host_y[6] = {0,0,0,0,0,0};

    // allocate device memory for COO format
    int   * device_I;
    cudaMalloc(&device_I, 13 * sizeof(int));
    int   * device_J;
    cudaMalloc(&device_J, 13 * sizeof(int));
    int * device_V;
    cudaMalloc(&device_V, 13 * sizeof(int));

    // allocate device memory for x and y arrays
    int * device_x;
    cudaMalloc(&device_x, 7 * sizeof(int));
    int * device_y;
    cudaMalloc(&device_y, 6 * sizeof(int));

    // copy raw data from host to device
    cudaMemcpy(device_I, host_I, 13 * sizeof(int),   cudaMemcpyHostToDevice);
    cudaMemcpy(device_J, host_J, 13 * sizeof(int),   cudaMemcpyHostToDevice);
    cudaMemcpy(device_V, host_V, 13 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(device_x, host_x,  7 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(device_y, host_y,  6 * sizeof(int), cudaMemcpyHostToDevice);

    // matrices and vectors now reside on the device

    // *NOTE* raw pointers must be wrapped with thrust::device_ptr!
    thrust::device_ptr<int>   wrapped_device_I(device_I);
    thrust::device_ptr<int>   wrapped_device_J(device_J);
    thrust::device_ptr<int>   wrapped_device_V(device_V);
    thrust::device_ptr<int>   wrapped_device_x(device_x);
    thrust::device_ptr<int>   wrapped_device_y(device_y);

    // use array1d_view to wrap the individual arrays
    typedef cusp::array1d_view< thrust::device_ptr<int> > DeviceIndexArrayView;
    typedef cusp::array1d_view< thrust::device_ptr<int> > DeviceValueArrayView;

    DeviceIndexArrayView row_indices   (wrapped_device_I, wrapped_device_I + 13);
    DeviceIndexArrayView column_indices(wrapped_device_J, wrapped_device_J + 13);
    DeviceValueArrayView values        (wrapped_device_V, wrapped_device_V + 13);
    DeviceValueArrayView x             (wrapped_device_x, wrapped_device_x + 7);
    DeviceValueArrayView y             (wrapped_device_y, wrapped_device_y + 6);

    // combine the three array1d_views into a coo_matrix_view
    typedef cusp::coo_matrix_view<DeviceIndexArrayView,
            DeviceIndexArrayView,
            DeviceValueArrayView> DeviceView;

    // construct a coo_matrix_view from the array1d_views
    DeviceView A(6, 7, 13, row_indices, column_indices, values);

    std::cout << "\ndevice coo_matrix_view" << std::endl;
    cusp::print(A);
    cusp::constant_functor<int> initialize;
    thrust::plus<int> combine;
    thrust::plus<int> reduce;
    cusp::multiply(A , x , y , initialize, combine, reduce);
    std::cout << "\nx array" << std::endl;
    cusp::print(x);
    std::cout << "\n y array, y = A * x" << std::endl;
    cusp::print(y);

    cudaMemcpy(host_y, device_y,  6 * sizeof(int), cudaMemcpyDeviceToHost);

    // free device arrays
    cudaFree(device_I);
    cudaFree(device_J);
    cudaFree(device_V);
    cudaFree(device_x);
    cudaFree(device_y);

    return 0;
}

And I got the output below.

device coo_matrix_view
sparse matrix <6, 7> with 13 entries
              0              0        (1)
              0              1        (1)
              1              1        (1)
              1              2        (1)
              2              2        (1)
              2              4        (1)
              2              6        (1)
              3              3        (1)
              3              4        (1)
              3              5        (1)
              4              5        (1)
              5              5        (1)
              5              6        (1)
x array
array1d <7>

        (1)
        (1)
        (1)
        (1)
        (1)
        (1)
        (1)
 y array, y = A * x
array1d <6>
        (4)
        (4)
        (6)
        (6)
        (2)
        (631)

The vector y I got is strange. I expected combine to be applied to A(i,j) and x[j] for every column j, treating the unstored entries of A as zero, and the results then reduced; for row 0 that gives (1+1) + (1+1) + 5 × (0+1) = 9. So I think the correct answer y should be:

[9,
9,
10,
10,
8,
9]

So I am not sure whether such a replacement of combine and reduce works for the other sparse matrix formats, like COO. Or maybe my code above calls multiply incorrectly. Can you give me some help? Any info will help.

Thank you!

From a very brief reading of the code and some instrumentation of your example, this seems to be something badly broken in CUSP for this use case. The code only appears to work correctly, by accident, in the case where the combine operator is multiplication, because the spurious operations it performs on zero elements do not affect the reduction operation (i.e. it just sums a lot of additional zeros).
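To make that concrete, here is a tiny, CUSP-free illustration of why a spurious zero element folded into a row's reduction is harmless when combine is multiplication but not when it is plus. This uses plain Thrust functors on the host; the zero-element behaviour is my paraphrase of what the kernel appears to do, not actual CUSP code:

#include <thrust/functional.h>
#include <iostream>

int main(void)
{
    thrust::multiplies<int> mul;
    thrust::plus<int>       add;

    int partial = 10;   // partial reduction of some row so far
    int x_j     = 1;    // vector element picked up by a spurious zero entry A(i,j) = 0

    // reduce is plus in both cases; only the combine operator differs
    std::cout << partial + mul(0, x_j) << std::endl;  // 10 : multiplication combine, unchanged
    std::cout << partial + add(0, x_j) << std::endl;  // 11 : plus combine, corrupted
    return 0;
}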
