简体   繁体   中英

How can I perform multiple matrix multiplications in CUDA?

I have an array of square matrices int *M[10]; so that M[i] locates the first element of the i -th matrix. I want to multiply all the matrices M[i] by another matrix N , so that I receive an array of square matrices int *P[10] as output.

There are different possibilities I see:

  1. Assigning the computation of a different element of M[i] to a different thread; for example, I have 10 matrices, 4x4 sized, so that the number of involved threads would be 160 ; how to use CUDA to implement this approach?
  2. In the framework of the example above, creating a composite matrix size 40x40 (ie, collecting 10 , 4x4 sized matrices together) and use 40x40 threads; but this approach seems to require more time; I'm trying with the array of matrices, but I think I'm doing something wrong; how can I use this approach with 10 matrices? How to code it in Kernel function?

This is what I'm trying:

// Multiply each host matrix M[i] (width x width, int, row-major) by the single
// host matrix N, writing the results into the host matrices P[i].
//
// Fixes vs. the original attempt:
//  - loop bounds match the declared array size (2, not 10 — the original
//    indexed devM[0..9] on arrays of size 2)
//  - devN is a single device buffer, allocated and copied once (the original
//    declared an array but used it as a scalar pointer)
//  - the kernel receives devM[i]/devP[i] (the original passed devM[2], one
//    past the end of the array)
//  - results are copied from devP[i] (the original copied P[i] onto itself,
//    and misspelled the size variable)
//  - devN is freed and the function body is properly closed
void GPU_Multi(int *M[2], int *N, int *P[2], size_t width)
{
    const int NUM = 2;                 // number of matrices in M[] / P[]
    int *devM[NUM];
    int *devN;                         // one shared device copy of N
    int *devP[NUM];
    size_t allocasize = sizeof(int) * width * width;

    for (int i = 0; i < NUM; i++)
    {
        cudaMalloc((void**)&devM[i], allocasize);
        cudaMalloc((void**)&devP[i], allocasize);
    }
    cudaMalloc((void**)&devN, allocasize);

    // N is the same operand for every product, so copy it to the device once.
    cudaMemcpy(devN, N, allocasize, cudaMemcpyHostToDevice);

    for (int i = 0; i < NUM; i++)
    {
        cudaMemcpy(devM[i], M[i], allocasize, cudaMemcpyHostToDevice);

        // NOTE(review): launch geometry kept from the original; confirm that
        // Kernel_Function really expects a (width*2) x (width*2) thread block
        // for a width x width product.
        dim3 block(width * 2, width * 2);
        dim3 grid(1, 1, 1);
        Kernel_Function<<<grid, block>>>(devM[i], devN, devP[i], width);

        cudaMemcpy(P[i], devP[i], allocasize, cudaMemcpyDeviceToHost);
        cudaFree(devM[i]);
        cudaFree(devP[i]);
    }
    cudaFree(devN);
}

I think it's likely that the fastest performance will be achieved by using the CUBLAS batch gemm function which was specifically designed for this purpose (performing a large number of "relatively small" matrix-matrix multiply operations).

Even though you want to multiply your array of matrices ( M[] ) by a single matrix ( N ), the batch gemm function will require you to pass also an array of matrices for N (ie N[] ), which will all be the same in your case.

EDIT: Now that I have worked through an example, it seems clear to me that with a modification to the example below, we can pass a single N matrix and have the GPU_Multi function simply send the single N matrix to the device, while passing an array of pointers for N , i.e. d_Narray in the example below, with all of the pointers pointing to the same N matrix on the device.

Here is a fully worked batch GEMM example:

#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>

#define ROWM 4
#define COLM 3
#define COLN 5

// Abort with a diagnostic if the most recent CUDA runtime call failed.
// NOTE: cudaGetLastError() also clears the sticky error state, so each use
// reports (and consumes) at most one pending error.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


typedef float mytype;
// Pi = Mi x Ni
// pr = P rows = M rows
// pc = P cols = N cols
// mc = M cols = N rows
// Batched product: P[i] = alpha * M[i] x N[i] + beta * P[i], i = 0..num_mat-1,
// computed with one cublasSgemmBatched call.
//
// Matrix layout follows the cuBLAS (COLUMN-major) convention:
//   M[i] is pr x mc, N[i] is mc x pc, P[i] is pr x pc,
// with pr = P rows (= M rows), pc = P cols (= N cols), mc = M cols (= N rows).
//
// Fix vs. original: the cuBLAS handle is now destroyed before returning
// (the original leaked one handle per call).
void GPU_Multi(mytype **M, mytype **N, mytype **P
  , size_t pr, size_t pc, size_t mc
  , size_t num_mat, mytype alpha, mytype beta)
{
    // Per-matrix device buffers. VLAs are a GNU extension accepted by nvcc's
    // usual host compilers; sized by the runtime batch count.
    mytype *devM[num_mat];
    mytype *devN[num_mat];
    mytype *devP[num_mat];
    size_t p_size = sizeof(mytype) * pr * pc;
    size_t m_size = sizeof(mytype) * pr * mc;
    size_t n_size = sizeof(mytype) * mc * pc;
    // Device-resident arrays of device pointers, as the batched API requires.
    const mytype **d_Marray, **d_Narray;
    mytype **d_Parray;
    cublasHandle_t myhandle;
    cublasStatus_t cublas_result;

    for (int i = 0; i < num_mat; i++)
    {
        cudaMalloc((void**)&devM[i], m_size);
        cudaMalloc((void**)&devN[i], n_size);
        cudaMalloc((void**)&devP[i], p_size);
    }
    cudaMalloc((void**)&d_Marray, num_mat * sizeof(mytype *));
    cudaMalloc((void**)&d_Narray, num_mat * sizeof(mytype *));
    cudaMalloc((void**)&d_Parray, num_mat * sizeof(mytype *));
    cudaCheckErrors("cudaMalloc fail");
    for (int i = 0; i < num_mat; i++) {
        // P is uploaded too because GEMM reads it when beta != 0.
        cudaMemcpy(devM[i], M[i], m_size, cudaMemcpyHostToDevice);
        cudaMemcpy(devN[i], N[i], n_size, cudaMemcpyHostToDevice);
        cudaMemcpy(devP[i], P[i], p_size, cudaMemcpyHostToDevice);
    }
    cudaMemcpy(d_Marray, devM, num_mat * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Narray, devN, num_mat * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Parray, devP, num_mat * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy H2D fail");
    cublas_result = cublasCreate(&myhandle);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // change to cublasDgemmBatched for double
    cublas_result = cublasSgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N
      , pr, pc, mc
      , &alpha, d_Marray, pr, d_Narray, mc
      , &beta, d_Parray, pr
      , num_mat);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // Fix: release the handle (the original version leaked it).
    cublas_result = cublasDestroy(myhandle);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);

    // cudaMemcpy is synchronizing, so results are complete when it returns.
    for (int i = 0; i < num_mat; i++)
    {
        cudaMemcpy(P[i], devP[i], p_size, cudaMemcpyDeviceToHost);
        cudaFree(devM[i]);
        cudaFree(devN[i]);
        cudaFree(devP[i]);
    }
    cudaFree(d_Marray);
    cudaFree(d_Narray);
    cudaFree(d_Parray);
    cudaCheckErrors("cudaMemcpy D2H fail");
}

// Host-side smoke test: two batched products with constant-valued matrices
// whose results are known in closed form, verified element by element.
int main(){

  mytype h_M1[ROWM][COLM], h_M2[ROWM][COLM];
  mytype h_N1[COLM][COLN], h_N2[COLM][COLN];
  mytype h_P1[ROWM][COLN], h_P2[ROWM][COLN];
  mytype *h_Marray[2], *h_Narray[2], *h_Parray[2];

  // M1 = all ones, M2 = all twos; both N matrices all ones; P zero-initialized.
  for (int r = 0; r < ROWM; ++r)
    for (int c = 0; c < COLM; ++c){
      h_M1[r][c] = 1.0f;
      h_M2[r][c] = 2.0f;
    }
  for (int r = 0; r < COLM; ++r)
    for (int c = 0; c < COLN; ++c){
      h_N1[r][c] = 1.0f;
      h_N2[r][c] = 1.0f;
    }
  for (int r = 0; r < ROWM; ++r)
    for (int c = 0; c < COLN; ++c){
      h_P1[r][c] = 0.0f;
      h_P2[r][c] = 0.0f;
    }

  h_Marray[0] = &h_M1[0][0];
  h_Marray[1] = &h_M2[0][0];
  h_Narray[0] = &h_N1[0][0];
  h_Narray[1] = &h_N2[0][0];
  h_Parray[0] = &h_P1[0][0];
  h_Parray[1] = &h_P2[0][0];

  GPU_Multi(h_Marray, h_Narray, h_Parray, ROWM, COLN, COLM, 2, 1.0f, 0.0f);

  // Each P element is a dot product of constant length-COLM vectors, so
  // every P1 entry must equal COLM*1 and every P2 entry COLM*2.
  for (int r = 0; r < ROWM; ++r)
    for (int c = 0; c < COLN; ++c){
      if (h_P1[r][c] != COLM*1.0f)
      {
        printf("h_P1 mismatch at %d,%d was: %f should be: %f\n"
          , r, c, h_P1[r][c], COLM*1.0f); return 1;
      }
      if (h_P2[r][c] != COLM*2.0f)
      {
        printf("h_P2 mismatch at %d,%d was: %f should be: %f\n"
          , r, c, h_P2[r][c], COLM*2.0f); return 1;
      }
    }
  printf("Success!\n");
  return 0;
}

As it emerged from the comments above and the answer by Robert Crovella, there are different possible approaches. Each of the approaches can be better suited for a different situation, i.e., for a different number N of matrices to multiply and for different matrix dimensions MxM. Let me summarize them below:

  1. If N is small and M is large, perhaps the best approach would be to use cublas<t>gemm called from host code;
  2. If N is moderate and M is moderate, and if a device with compute capability of at least 3.5 is available, then a good possibility would be to use dynamic parallelism, namely, creating a thread grid of N threads and launching a cublas<t>gemm from within a kernel; perhaps this approach would fail for large N or M due to the large number of threads required;
  3. If N is large and M is small, then the cuBLAS batched approach linked to by Robert Crovella could be of interest;
  4. Similarly, if N is large and M is small, then a cuBLAS stream-based approach would be worth a try, as also mentioned in Robert's comment;
  5. If N is large and M is very small, an approach using a thread grid of N threads, each "manually" calculating an optimized matrix multiplication could be appealing; for example, if one has to construct a matrix multiplication algorithm for 4x4 matrices, then one could optimize the matrix multiplication performed by each thread according to Number of elementary multiplications for multiplying 4x4 matrices .

In case data is stored in host memory in row major order and we wish to perform matrix multiplication and retrieve the data back in row major order, the below code does that

#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>

#define ROWM 4
#define COLM 3
#define COLN 5

// Abort with a diagnostic if the most recent CUDA runtime call failed.
// NOTE: cudaGetLastError() also clears the sticky error state, so each use
// reports (and consumes) at most one pending error.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// Print a rows x cols matrix of floats, one row per line.
// mode == 'N': ptr is read in row-major order; non-negative values get an
// extra leading space so columns of mixed-sign data line up.
// Any other mode: ptr is read in column-major (transposed) order; note that
// in this mode positive and negative values use the same format (no extra
// leading space), matching the original behavior.
void printArrayS(float *ptr, int rows, int cols, char mode, char *name)
{
    printf("%s\n", name);
    for (int r = 0; r < rows; r++)
    {
        for (int c = 0; c < cols; c++)
        {
            // Pick the element according to the requested storage order.
            float v = (mode == 'N') ? ptr[r * cols + c] : ptr[c * rows + r];
            if (mode == 'N' && v >= 0)
                printf(" %3.6f ", v);
            else
                printf("%3.6f ", v);
        }
        printf("\n");
    }
}

typedef float mytype;
// Pi = Mi x Ni
// pr = P rows = M rows
// pc = P cols = N cols
// mc = M cols = N rows
// Batched product for ROW-major host data:
//   P[i] = alpha * M[i] x N[i] + beta * P[i], i = 0..num_mat-1.
// cuBLAS is column-major; a row-major A is the column-major A^T, so we compute
// P^T = N^T x M^T by swapping the operand order and dimensions in the
// cublasSgemmBatched call. Results arrive back on the host in row-major order.
//   pr = P rows (= M rows), pc = P cols (= N cols), mc = M cols (= N rows).
//
// Fixes vs. original: the batch size now honors the num_mat parameter (the
// original hardcoded "#define NUM_MAT 2" and silently ignored num_mat), and
// the cuBLAS handle is destroyed before returning instead of being leaked.
void GPU_Multi(mytype **M, mytype **N, mytype **P,
               size_t pr, size_t pc, size_t mc,
               size_t num_mat, mytype alpha, mytype beta)
{
    // Per-matrix device buffers (VLA, a GNU extension accepted by nvcc's
    // usual host compilers), sized by the runtime batch count.
    mytype *devM[num_mat];
    mytype *devN[num_mat];
    mytype *devP[num_mat];
    size_t p_size = sizeof(mytype) * pr * pc;
    size_t m_size = sizeof(mytype) * pr * mc;
    size_t n_size = sizeof(mytype) * mc * pc;
    // Device-resident pointer arrays, as required by the batched API.
    const mytype **d_Marray, **d_Narray;
    mytype **d_Parray;
    cublasHandle_t myhandle;
    cublasStatus_t cublas_result;

    for (int i = 0; i < num_mat; i++)
    {
        cudaMalloc((void **)&devM[i], m_size);
        cudaMalloc((void **)&devN[i], n_size);
        cudaMalloc((void **)&devP[i], p_size);
    }
    cudaMalloc((void **)&d_Marray, num_mat * sizeof(mytype *));
    cudaMalloc((void **)&d_Narray, num_mat * sizeof(mytype *));
    cudaMalloc((void **)&d_Parray, num_mat * sizeof(mytype *));
    cudaCheckErrors("cudaMalloc fail");
    for (int i = 0; i < num_mat; i++) {
        // P is uploaded too because GEMM reads it when beta != 0.
        cudaMemcpy(devM[i], M[i], m_size, cudaMemcpyHostToDevice);
        cudaMemcpy(devN[i], N[i], n_size, cudaMemcpyHostToDevice);
        cudaMemcpy(devP[i], P[i], p_size, cudaMemcpyHostToDevice);
    }
    cudaMemcpy(d_Marray, devM, num_mat * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Narray, devN, num_mat * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Parray, devP, num_mat * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy H2D fail");
    cublas_result = cublasCreate(&myhandle);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // Row-major trick: P^T = N^T * M^T, so N comes first and the m/n/lda
    // arguments are swapped relative to the column-major call.
    // Change to cublasDgemmBatched for double precision.
    cublas_result = cublasSgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N
                                       , pc, pr, mc
                                       , &alpha, d_Narray, pc, d_Marray, mc
                                       , &beta, d_Parray, pc
                                       , num_mat);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // Fix: release the handle (the original version leaked it).
    cublas_result = cublasDestroy(myhandle);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);

    for (int i = 0; i < num_mat; i++)
    {
        cudaMemcpy(P[i], devP[i], p_size, cudaMemcpyDeviceToHost);
        cudaFree(devM[i]);
        cudaFree(devN[i]);
        cudaFree(devP[i]);
    }
    cudaFree(d_Marray);
    cudaFree(d_Narray);
    cudaFree(d_Parray);
    cudaCheckErrors("cudaMemcpy D2H fail");
}

// Demo driver: fills two (M, N) operand pairs with small integer-valued
// patterns, prints the inputs, runs the row-major batched multiply, then
// prints both products.
int main() {

    mytype h_M1[ROWM][COLM], h_M2[ROWM][COLM];
    mytype h_N1[COLM][COLN], h_N2[COLM][COLN];
    mytype h_P1[ROWM][COLN], h_P2[ROWM][COLN];
    mytype *h_Marray[2], *h_Narray[2], *h_Parray[2];

    // M1[r][c] = r+c, M2[r][c] = 2*(r-c); N1[r][c] = r+c, N2[r][c] = r-c.
    for (int r = 0; r < ROWM; ++r)
        for (int c = 0; c < COLM; ++c) {
            h_M1[r][c] = (r + c) * 1.0f;
            h_M2[r][c] = (r - c) * 2.0f;
        }
    for (int r = 0; r < COLM; ++r)
        for (int c = 0; c < COLN; ++c) {
            h_N1[r][c] = (r + c) * 1.0f;
            h_N2[r][c] = (r - c) * 1.0f;
        }
    for (int r = 0; r < ROWM; ++r)
        for (int c = 0; c < COLN; ++c) {
            h_P1[r][c] = 0.0f;
            h_P2[r][c] = 0.0f;
        }

    printArrayS((float *)h_M1, ROWM, COLM, 'N', "h_M1");
    printArrayS((float *)h_N1, COLM, COLN, 'N', "h_N1");
    printArrayS((float *)h_M2, ROWM, COLM, 'N', "h_M2");
    printArrayS((float *)h_N2, COLM, COLN, 'N', "h_N2");

    h_Marray[0] = &h_M1[0][0];
    h_Marray[1] = &h_M2[0][0];
    h_Narray[0] = &h_N1[0][0];
    h_Narray[1] = &h_N2[0][0];
    h_Parray[0] = &h_P1[0][0];
    h_Parray[1] = &h_P2[0][0];

    GPU_Multi(h_Marray, h_Narray, h_Parray, ROWM, COLN, COLM, 2, 1.0f, 0.0f);

    printArrayS((float *)h_P1, ROWM, COLN, 'N', "h_P1");
    printArrayS((float *)h_P2, ROWM, COLN, 'N', "h_P2");

    return 0;
}

Result

h_M1
 0.000000  1.000000  2.000000
 1.000000  2.000000  3.000000
 2.000000  3.000000  4.000000
 3.000000  4.000000  5.000000
h_N1
 0.000000  1.000000  2.000000  3.000000  4.000000
 1.000000  2.000000  3.000000  4.000000  5.000000
 2.000000  3.000000  4.000000  5.000000  6.000000
h_M2
 0.000000 -2.000000 -4.000000
 2.000000  0.000000 -2.000000
 4.000000  2.000000  0.000000
 6.000000  4.000000  2.000000
h_N2
 0.000000 -1.000000 -2.000000 -3.000000 -4.000000
 1.000000  0.000000 -1.000000 -2.000000 -3.000000
 2.000000  1.000000  0.000000 -1.000000 -2.000000
h_P1
 5.000000  8.000000  11.000000  14.000000  17.000000
 8.000000  14.000000  20.000000  26.000000  32.000000
 11.000000  20.000000  29.000000  38.000000  47.000000
 14.000000  26.000000  38.000000  50.000000  62.000000
h_P2
-10.000000 -4.000000  2.000000  8.000000  14.000000
-4.000000 -4.000000 -4.000000 -4.000000 -4.000000
 2.000000 -4.000000 -10.000000 -16.000000 -22.000000
 8.000000 -4.000000 -16.000000 -28.000000 -40.000000

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM