How to perform multiple matrix multiplications in CUDA?

I have an array of square matrices, int *M[10];, so that M[i] points to the first element of the i-th matrix. I want to multiply all the matrices M[i] by another matrix N, so that I receive an array of square matrices int *P[10] as output.

I see different possibilities:

  1. Assign the computation of each element of M[i] to a different thread; for example, with 10 matrices of size 4x4, the number of involved threads would be 160. How can I use CUDA to implement this approach?
  2. In the framework of the example above, create one composite matrix of size 40x40 (i.e., collecting the 10 4x4 matrices together) and use 40x40 threads; but this approach seems to require more time. I'm trying with the array of matrices, but I think I'm doing something wrong; how can I use this approach with 10 matrices, and how should it be coded in the kernel function?

This is what I'm trying:

void GPU_Multi(int *M[10], int *N, int *P[10], size_t width)
{
    int *devM[10];
    int *devN;
    int *devP[10];
    size_t allocasize = sizeof(int) * width * width;

    for (int i = 0; i < 10; i++)
    {
        cudaMalloc((void**)&devM[i], allocasize);
        cudaMalloc((void**)&devP[i], allocasize);
    }
    // N is shared by all multiplications, so allocate and copy it once
    cudaMalloc((void**)&devN, allocasize);
    cudaMemcpy(devN, N, allocasize, cudaMemcpyHostToDevice);

    for (int i = 0; i < 10; i++)
    {
        cudaMemcpy(devM[i], M[i], allocasize, cudaMemcpyHostToDevice);
        dim3 block(width, width);   // one thread per element of the i-th product
        dim3 grid(1, 1, 1);
        Kernel_Function<<<grid, block>>>(devM[i], devN, devP[i], width);
    }

    for (int i = 0; i < 10; i++)
    {
        cudaMemcpy(P[i], devP[i], allocasize, cudaMemcpyDeviceToHost);
        cudaFree(devM[i]);
        cudaFree(devP[i]);
    }
    cudaFree(devN);
}

I think it's likely that the fastest performance will be achieved by using the CUBLAS batch gemm function, which was specifically designed for this purpose (performing a large number of "relatively small" matrix-matrix multiply operations).

Even though you want to multiply your array of matrices (M[]) by a single matrix (N), the batch gemm function will also require you to pass an array of matrices for N (i.e., N[]), which in your case will all be the same.

EDIT: Now that I have worked through an example, it seems clear to me that, with a modification to the example below, we can pass a single N matrix and have the GPU_Multi function simply send that single N matrix to the device, while passing an array of pointers for N, i.e., d_Narray in the example below, with all of the pointers pointing to the same N matrix on the device.
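
A minimal sketch of that modification (my illustration, reusing the names mytype, n_size, num_mat, and d_Narray from the example below; N_host is a hypothetical host pointer to the single N matrix):

// Allocate and copy one device instance of N (sketch; assumes n_size and
// num_mat are in scope as in the example below)
mytype *devN_single;
cudaMalloc((void**)&devN_single, n_size);
cudaMemcpy(devN_single, N_host, n_size, cudaMemcpyHostToDevice);

// Fill a host-side pointer array with num_mat copies of the same device address
mytype *h_Nptrs[num_mat];
for (int i = 0; i < num_mat; i++) h_Nptrs[i] = devN_single;

// d_Narray then holds num_mat aliases of the single device copy of N
cudaMemcpy(d_Narray, h_Nptrs, num_mat * sizeof(mytype *), cudaMemcpyHostToDevice);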

Here is a fully worked batch GEMM example:

#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>

#define ROWM 4
#define COLM 3
#define COLN 5

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


typedef float mytype;
// Pi = Mi x Ni
// pr = P rows = M rows
// pc = P cols = N cols
// mc = M cols = N rows
void GPU_Multi(mytype **M, mytype **N, mytype **P
  , size_t pr, size_t pc, size_t mc
  , size_t num_mat, mytype alpha, mytype beta)
{

    mytype *devM[num_mat];
    mytype *devN[num_mat];
    mytype *devP[num_mat];
    size_t p_size = sizeof(mytype) * pr * pc;
    size_t m_size = sizeof(mytype) * pr * mc;
    size_t n_size = sizeof(mytype) * mc * pc;
    const mytype **d_Marray, **d_Narray;
    mytype **d_Parray;
    cublasHandle_t myhandle;
    cublasStatus_t cublas_result;

    for(int i = 0 ; i < num_mat; i ++ )
    {
        cudaMalloc((void**)&devM[ i ], m_size );
        cudaMalloc((void**)&devN[ i ], n_size );
        cudaMalloc((void**)&devP[ i ], p_size );
    }
    cudaMalloc((void**)&d_Marray, num_mat*sizeof(mytype *));
    cudaMalloc((void**)&d_Narray, num_mat*sizeof(mytype *));
    cudaMalloc((void**)&d_Parray, num_mat*sizeof(mytype *));
    cudaCheckErrors("cudaMalloc fail");
    for(int i = 0 ; i < num_mat; i ++ ) {

        cudaMemcpy(devM[i], M[i], m_size , cudaMemcpyHostToDevice);
        cudaMemcpy(devN[i], N[i], n_size , cudaMemcpyHostToDevice);
        cudaMemcpy(devP[i], P[i], p_size , cudaMemcpyHostToDevice);
    }
    cudaMemcpy(d_Marray, devM, num_mat*sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Narray, devN, num_mat*sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Parray, devP, num_mat*sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy H2D fail");
    cublas_result = cublasCreate(&myhandle);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // change to    cublasDgemmBatched for double
    cublas_result = cublasSgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N
      , pr, pc, mc
      , &alpha, d_Marray, pr, d_Narray, mc
      , &beta, d_Parray, pr
      , num_mat);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);

    for(int i = 0 ; i < num_mat ; i ++ )
    {
        cudaMemcpy(P[i], devP[i], p_size, cudaMemcpyDeviceToHost);
        cudaFree(devM[i]);
        cudaFree(devN[i]);
        cudaFree(devP[i]);
    }
    cudaFree(d_Marray);
    cudaFree(d_Narray);
    cudaFree(d_Parray);
    cudaCheckErrors("cudaMemcpy D2H fail");

}

int main(){

  mytype h_M1[ROWM][COLM], h_M2[ROWM][COLM];
  mytype h_N1[COLM][COLN], h_N2[COLM][COLN];
  mytype h_P1[ROWM][COLN], h_P2[ROWM][COLN];
  mytype *h_Marray[2], *h_Narray[2], *h_Parray[2];
  for (int i = 0; i < ROWM; i++)
    for (int j = 0; j < COLM; j++){
      h_M1[i][j] = 1.0f; h_M2[i][j] = 2.0f;}
  for (int i = 0; i < COLM; i++)
    for (int j = 0; j < COLN; j++){
      h_N1[i][j] = 1.0f; h_N2[i][j] = 1.0f;}
  for (int i = 0; i < ROWM; i++)
    for (int j = 0; j < COLN; j++){
      h_P1[i][j] = 0.0f; h_P2[i][j] = 0.0f;}

  h_Marray[0] = &(h_M1[0][0]);
  h_Marray[1] = &(h_M2[0][0]);
  h_Narray[0] = &(h_N1[0][0]);
  h_Narray[1] = &(h_N2[0][0]);
  h_Parray[0] = &(h_P1[0][0]);
  h_Parray[1] = &(h_P2[0][0]);

  GPU_Multi(h_Marray, h_Narray, h_Parray, ROWM, COLN, COLM, 2, 1.0f, 0.0f);
  for (int i = 0; i < ROWM; i++)
    for (int j = 0; j < COLN; j++){
      if (h_P1[i][j] != COLM*1.0f)
      {
        printf("h_P1 mismatch at %d,%d was: %f should be: %f\n"
          , i, j, h_P1[i][j], COLM*1.0f); return 1;
      }
      if (h_P2[i][j] != COLM*2.0f)
      {
        printf("h_P2 mismatch at %d,%d was: %f should be: %f\n"
          , i, j, h_P2[i][j], COLM*2.0f); return 1;
      }
    }
  printf("Success!\n");
  return 0;
}
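
This should compile with, for example, nvcc -o batch_gemm batch_gemm.cu -lcublas (the file name is arbitrary) and print Success! when the results check out. One caveat worth noting: cuBLAS assumes column-major storage, and the example above gets away with passing row-major host arrays only because each test matrix is filled with a single constant value, so the storage order does not affect the result. The second listing further below handles general row-major data by swapping the operand order in the cublasSgemmBatched call.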

As emerged from the comments above and from Robert Crovella's answer, there are different possible approaches. Each approach can be better suited to a different situation, i.e., to a different number N of matrices to multiply and to different matrix dimensions MxM. Let me summarize them below:

  1. If N is small and M is large, perhaps the best approach would be to use cublas<t>gemm called from host code;
  2. If N is moderate and M is moderate, and if a device with compute capability of at least 3.5 is available, then a good possibility would be to use dynamic parallelism, namely, creating a thread grid of N threads and launching a cublas<t>gemm from within a kernel; this approach would perhaps fail for large N or M due to the large number of threads required;
  3. If N is large and M is small, then the cuBLAS batched approach linked to by Robert Crovella could be of interest;
  4. Similarly, if N is large and M is small, then a cuBLAS stream-based approach would be worth a try, as also mentioned in Robert's comment;
  5. If N is large and M is very small, an approach using a thread grid of N threads, each "manually" computing an optimized matrix multiplication, could be appealing; for example, if one has to construct a matrix multiplication algorithm for 4x4 matrices, then one could optimize the multiplication performed by each thread according to Number of elementary multiplications for multiplying 4x4 matrices. A minimal sketch of this last approach is shown after the list.
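
The following kernel is my own illustrative sketch of approach 5, not code from the original answers; it assumes the input matrices M and outputs P are stored contiguously on the device as 16-float row-major blocks, and that N is a single 4x4 matrix:

#include <cuda_runtime.h>

// One thread per matrix pair: thread t computes P[t] = M[t] x N for 4x4
// matrices, with fully unrolled loops so all indexing is compile-time constant.
__global__ void smallMatMulKernel(const float *M, const float *N, float *P,
                                  int num_mat)
{
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t >= num_mat) return;
    const float *Mi = M + t * 16;   // t-th input matrix, row-major
    float       *Pi = P + t * 16;   // t-th output matrix, row-major
    #pragma unroll
    for (int r = 0; r < 4; r++)
    {
        #pragma unroll
        for (int c = 0; c < 4; c++)
        {
            float acc = 0.0f;
            #pragma unroll
            for (int k = 0; k < 4; k++)
                acc += Mi[r * 4 + k] * N[k * 4 + c];
            Pi[r * 4 + c] = acc;
        }
    }
}

A launch such as smallMatMulKernel<<<(num_mat + 127) / 128, 128>>>(dM, dN, dP, num_mat) would cover all matrices; for very small matrices one could further reduce the number of elementary multiplications along the lines of the question linked in item 5.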

In case the data is stored in host memory in row-major order and we wish to perform the matrix multiplication and retrieve the data back in row-major order, the code below does that:

#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>

#define ROWM 4
#define COLM 3
#define COLN 5

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

void printArrayS(float *ptr, int rows, int cols, char mode, const char *name)
{
    printf("%s\n", name);
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            if (mode == 'N') /* Normal mode */
            {
                if (ptr[i * cols + j] >= 0)
                    printf(" %3.6f ", ptr[i * cols + j]);
                else
                    printf("%3.6f ", ptr[i * cols + j]);
            }
            else /* Transpose mode */
            {
                if (ptr[j * rows + i] >= 0)
                    printf("%3.6f ", ptr[j * rows + i]);
                else
                    printf("%3.6f ", ptr[j * rows + i]);
            }
        }
        printf("\n");
    }
}

typedef float mytype;
// Pi = Mi x Ni
// pr = P rows = M rows
// pc = P cols = N cols
// mc = M cols = N rows
void GPU_Multi(mytype **M, mytype **N, mytype **P,
               size_t pr, size_t pc, size_t mc,
               size_t num_mat, mytype alpha, mytype beta)
{
    #define NUM_MAT   2

    mytype *devM[NUM_MAT];
    mytype *devN[NUM_MAT];
    mytype *devP[NUM_MAT];
    size_t p_size = sizeof(mytype) * pr * pc;
    size_t m_size = sizeof(mytype) * pr * mc;
    size_t n_size = sizeof(mytype) * mc * pc;
    const mytype **d_Marray, **d_Narray;
    mytype **d_Parray;
    cublasHandle_t myhandle;
    cublasStatus_t cublas_result;

    for (int i = 0; i < NUM_MAT; i++)
    {
        cudaMalloc((void **)&devM[i], m_size);
        cudaMalloc((void **)&devN[i], n_size);
        cudaMalloc((void **)&devP[i], p_size);
    }
    cudaMalloc((void **)&d_Marray, NUM_MAT * sizeof(mytype *));
    cudaMalloc((void **)&d_Narray, NUM_MAT * sizeof(mytype *));
    cudaMalloc((void **)&d_Parray, NUM_MAT * sizeof(mytype *));
    cudaCheckErrors("cudaMalloc fail");
    for (int i = 0; i < NUM_MAT; i++) {

        cudaMemcpy(devM[i], M[i], m_size, cudaMemcpyHostToDevice);
        cudaMemcpy(devN[i], N[i], n_size, cudaMemcpyHostToDevice);
        cudaMemcpy(devP[i], P[i], p_size, cudaMemcpyHostToDevice);
    }
    cudaMemcpy(d_Marray, devM, NUM_MAT * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Narray, devN, NUM_MAT * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Parray, devP, NUM_MAT * sizeof(mytype *), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy H2D fail");
    cublas_result = cublasCreate(&myhandle);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);
    // change to    cublasDgemmBatched for double
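    // Note (added for clarity): cuBLAS assumes column-major storage. A
    // row-major matrix viewed as column-major is its transpose, so the
    // row-major product P = M x N is obtained here by computing the
    // column-major product N x M, i.e., P^T = N^T x M^T, with the m/n
    // dimensions, leading dimensions, and operand pointers swapped accordingly.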
    cublas_result = cublasSgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N
                                       , pc, pr, mc
                                       , &alpha, d_Narray, pc, d_Marray, mc
                                       , &beta, d_Parray, pc
                                       , NUM_MAT);
    assert(cublas_result == CUBLAS_STATUS_SUCCESS);

    for (int i = 0; i < NUM_MAT; i++)
    {
        cudaMemcpy(P[i], devP[i], p_size, cudaMemcpyDeviceToHost);
        cudaFree(devM[i]);
        cudaFree(devN[i]);
        cudaFree(devP[i]);
    }
    cudaFree(d_Marray);
    cudaFree(d_Narray);
    cudaFree(d_Parray);
    cudaCheckErrors("cudaMemcpy D2H fail");

}

int main() {

    mytype h_M1[ROWM][COLM], h_M2[ROWM][COLM];
    mytype h_N1[COLM][COLN], h_N2[COLM][COLN];
    mytype h_P1[ROWM][COLN], h_P2[ROWM][COLN];
    mytype *h_Marray[2], *h_Narray[2], *h_Parray[2];
    for (int i = 0; i < ROWM; i++)
        for (int j = 0; j < COLM; j++) {
            h_M1[i][j] = (i + j) * 1.0f; h_M2[i][j] = (i - j) * 2.0f;
        }
    for (int i = 0; i < COLM; i++)
        for (int j = 0; j < COLN; j++) {
            h_N1[i][j] = (i + j) * 1.0f; h_N2[i][j] = (i - j) * 1.0f;
        }
    for (int i = 0; i < ROWM; i++)
        for (int j = 0; j < COLN; j++) {
            h_P1[i][j] = 0.0f; h_P2[i][j] = 0.0f;
        }

    printArrayS((float *)h_M1, ROWM, COLM, 'N', "h_M1");
    printArrayS((float *)h_N1, COLM, COLN, 'N', "h_N1");
    printArrayS((float *)h_M2, ROWM, COLM, 'N', "h_M2");
    printArrayS((float *)h_N2, COLM, COLN, 'N', "h_N2");

    h_Marray[0] = &(h_M1[0][0]);
    h_Marray[1] = &(h_M2[0][0]);
    h_Narray[0] = &(h_N1[0][0]);
    h_Narray[1] = &(h_N2[0][0]);
    h_Parray[0] = &(h_P1[0][0]);
    h_Parray[1] = &(h_P2[0][0]);

    GPU_Multi(h_Marray, h_Narray, h_Parray, ROWM, COLN, COLM, 2, 1.0f, 0.0f);

    printArrayS((float *)h_P1, ROWM, COLN, 'N', "h_P1");
    printArrayS((float *)h_P2, ROWM, COLN, 'N', "h_P2");

    return 0;
}

Result

h_M1
 0.000000  1.000000  2.000000
 1.000000  2.000000  3.000000
 2.000000  3.000000  4.000000
 3.000000  4.000000  5.000000
h_N1
 0.000000  1.000000  2.000000  3.000000  4.000000
 1.000000  2.000000  3.000000  4.000000  5.000000
 2.000000  3.000000  4.000000  5.000000  6.000000
h_M2
 0.000000 -2.000000 -4.000000
 2.000000  0.000000 -2.000000
 4.000000  2.000000  0.000000
 6.000000  4.000000  2.000000
h_N2
 0.000000 -1.000000 -2.000000 -3.000000 -4.000000
 1.000000  0.000000 -1.000000 -2.000000 -3.000000
 2.000000  1.000000  0.000000 -1.000000 -2.000000
h_P1
 5.000000  8.000000  11.000000  14.000000  17.000000
 8.000000  14.000000  20.000000  26.000000  32.000000
 11.000000  20.000000  29.000000  38.000000  47.000000
 14.000000  26.000000  38.000000  50.000000  62.000000
h_P2
-10.000000 -4.000000  2.000000  8.000000  14.000000
-4.000000 -4.000000 -4.000000 -4.000000 -4.000000
 2.000000 -4.000000 -10.000000 -16.000000 -22.000000
 8.000000 -4.000000 -16.000000 -28.000000 -40.000000
