使用CUDA進行矩陣乘法，執行時間長

Question

我是CUDA的新手，我一直試圖找出我在這里做錯了什么。 CUDA花費的時間比僅使用CPU乘以矩陣要長。 如果我做錯了，請告訴我。 這是我的代碼：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <cstdlib>
#include <assert.h>
#include <time.h>
#define size 100   // Matrix size
#define cols size   // Matrix width
#define rows size   // Matrix height

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) 
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }                         
}
__global__ void matrixMul( int *A, int *B, int *C)
{   
    int bx = blockIdx.x; // Block index
    int tx = threadIdx.x; // Thread index
    int ts = blockDim.x; // number of threads   
    // Declaration of the shared memory C element
    extern __shared__ int c_element_sum[];
    c_element_sum[tx] = A[tx+((bx/ts)*ts)] * B[(bx%ts)+(tx*ts)];

    //Block until all threads in the block have written their data to shared mem
    __syncthreads();

    int sum;
    for(int i=0; i<ts; i++){
        if(i==0){
            sum=c_element_sum[i];
        }
        else{
            sum+=c_element_sum[i];
        }
    }
    C[bx] = sum;

}


/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////

int main(int argc, char** argv)
{
   //create timer.
   clock_t t1, t2;

   //start timer
   t1=clock();

   //allocate host memory for matrices
   unsigned int size_A = cols * rows;
   unsigned int mem_size_A = sizeof(int) * size_A;
   int* mA = (int*) malloc(mem_size_A);

   unsigned int size_B = cols * rows;
   unsigned int mem_size_B = sizeof(int) * size_B;
   int* mB = (int*) malloc(mem_size_B);

   unsigned int size_C = cols * rows;
   unsigned int mem_size_C = sizeof(int) * size_C;
   int* mC = (int*) malloc(mem_size_C);

   //initialize host memory
   for (int i = 0; i < size_A; ++i){
       mA[i] = 1;
       mB[i] = 1;
       mC[i] = 0;
   }

   // allocate device memory
   int* d_mA;
   int* d_mB;
   int* d_mC;
   cudaMalloc((void**) &d_mA, mem_size_A);
   cudaMalloc((void**) &d_mB, mem_size_B);
   cudaMalloc((void**) &d_mC, mem_size_C);

   //copy host memory to device (A and B)
   cudaMemcpy(d_mA, mA, mem_size_A, cudaMemcpyHostToDevice);
   cudaMemcpy(d_mB, mB, mem_size_B, cudaMemcpyHostToDevice);
   cudaMemcpy(d_mC, mC, mem_size_C, cudaMemcpyHostToDevice);

   // setup execution parameters
   int numThreadsPerBlock = cols;
   int numBlocks = (cols * rows);
   int sharedMemSize = numThreadsPerBlock * sizeof(int);

   dim3 dimGrid(numBlocks);
   dim3 dimBlock(numThreadsPerBlock);

   // execute the kernel
   matrixMul <<< dimGrid, dimBlock, sharedMemSize >>>(d_mA, d_mB, d_mC);

   //Block until device has completed
   cudaThreadSynchronize();

   // check if kernel execution generated an error
   // Check for any CUDA errors
   checkCUDAError("kernel invocation");

   //copy result from device to host
   cudaMemcpy(mC, d_mC, mem_size_C, cudaMemcpyDeviceToHost);

   // Check for any CUDA errors
   checkCUDAError("memcpy");

   //stop timer
   t2 = clock();

   //check results
   for (int i = 0; i < size_C; ++i){
       assert(mC[i] == cols);
   }

   //clean up memory
   free(mA);
   free(mB);
   free(mC);
   cudaFree(d_mA);
   cudaFree(d_mB);
   cudaFree(d_mC);

   printf("WITH CUDA - clocks: %d \n\n", t2-t1);

   //////////////////////////////
   ///////// CPU ONLY //////////
   /////////////////////////////

   //create timer.
   clock_t cpu_t1, cpu_t2;

   //start timer
   cpu_t1=clock();

   //allocate host memory for matrices
   unsigned int cpu_size_A = cols * rows;
   unsigned int cpu_mem_size_A = sizeof(int) * cpu_size_A;
   int* cpu_mA = (int*) malloc(cpu_mem_size_A);

   unsigned int cpu_size_B = cols * rows;
   unsigned int cpu_mem_size_B = sizeof(int) * cpu_size_B;
   int* cpu_mB = (int*) malloc(cpu_mem_size_B);

   unsigned int cpu_size_C = cols * rows;
   unsigned int cpu_mem_size_C = sizeof(int) * cpu_size_C;
   int* cpu_mC = (int*) malloc(cpu_mem_size_C);

   //initialize host memory
   for (int i = 0; i < cpu_size_A; ++i){
       cpu_mA[i] = 1;
       cpu_mB[i] = 1;
       cpu_mC[i] = 0;
   }

   int ts = cols;
   for(int bx=0; bx<(cols*rows);bx++){
       int sum = 0;
       for(int tx=0; tx<cols; tx++){
          sum += cpu_mA[tx+((bx/ts)*ts)] * cpu_mB[(bx%ts)+(tx*ts)];
       }
       cpu_mC[bx]=sum;
   }

   //stop timer
   cpu_t2 = clock();

   //check results
   for (int i = 0; i < cpu_size_C; ++i){
       assert(cpu_mC[i] == cols);
   }

   //clean up memory
   free(cpu_mA);
   free(cpu_mB);
   free(cpu_mC);

   printf("CPU ONLY - clocks: %d \n\n", cpu_t2-cpu_t1);

   return 0;
}

Answer 1

根據您的計划，這是預期的。 您的計時器看起來像計時程序的整個執行時鍾，包括復制到設備，計算時間和復制結果。 鑒於您為程序提供的工作量相當小（100x100矩陣），內存副本的開銷遠遠超過使用內核進行計算時獲得的任何計算優勢。 您的內核本身也不是最有效的實現。

我不認為你做錯什么，只是，你還沒有規定的GPU工作的足夠大的塊，你可能會進一步優化內核。 請注意，簡單地擴大塊的大小可能不會顯着提高CPU的性能，因為您還會擴大內存管理時間。 雖然在CUDA上編寫程序的第一個實現相對簡單，但要從中獲得良好的性能要困難得多。 使用CUDA的最有效方法是使計算與內存事務的比率較高。 例如，擁有多個計算密集型內核的管道，可以在一塊數據上連續運行，只需要在開始和結束時進行主機設備復制。

如果這只是一個幫助您學習CUDA代碼的程序，那么這是一個很好的步驟，深入了解如何優化矩陣乘法內核將在很多其他情況下很好地為您服務。 如果您正在編寫此內核以用於生產軟件，我建議您使用高度優化的線性代數庫CUBLAS： http ： //developer.nvidia.com/cublas （或其他一些努力工作的庫）已經為你完成了）。

使用CUDA進行矩陣乘法，執行時間長

問題描述

1 個解決方案

解決方案1
7 已采納 2012-04-02 04:24:18

使用CUDA進行矩陣乘法，執行時間長

問題描述

1 個解決方案

解決方案1 7 已采納 2012-04-02 04:24:18

解決方案1
7 已采納 2012-04-02 04:24:18