简体   繁体   English

cuBLAS矩阵逆运算比MATLAB慢得多

[英]cuBLAS matrix inverse much slower than MATLAB

In my current project, I am attempting to calculate the inverse of a large (n > 2000) matrix with cuBLAS. 在我当前的项目中,我正在尝试使用cuBLAS计算大型(n> 2000)矩阵的逆。 The inverse calculation succeeds, but for some reason the calculation times are significantly slower than those in MATLAB. 进行了逆计算,但是由于某些原因,与在MATLAB中进行计算相比,计算时间明显慢。

I have attached a sample calculation performed on random matrices using my implementation in either language as well as performance results. 我已经附上了使用两种语言以及性能结果对我的随机矩阵执行的示例计算。

Any help or suggestions on what may be causing this slowdown would be greatly appreciated. 对于可能导致此速度下降的任何帮助或建议,将不胜感激。

Thank you in advance. 先感谢您。

Comparison 对照

cuBLAS vs. MATLAB cuBLAS与MATLAB

N = 500 : cuBLAS ~ 0.130 sec, MATLAB ~ 0.066 sec -> ~1.97x slower N = 500:cuBLAS〜0.130秒,MATLAB〜0.066秒->〜1.97倍慢

N = 1000 : cuBLAS ~ 0.898 sec, MATLAB ~ 0.311 sec -> ~2.89x slower N = 1000:cuBLAS〜0.898秒,MATLAB〜0.311秒->〜2.89倍慢

N = 2000 : cuBLAS ~ 6.667 sec, MATLAB ~ 0.659 sec -> ~10.12x slower N = 2000:cuBLAS〜6.667秒,MATLAB〜0.659秒->〜10.12x慢

N = 4000 : cuBLAS ~ 51.860 sec, MATLAB ~ 4.296 sec -> ~12.07x slower N = 4000:cuBLAS〜51.860秒,MATLAB〜4.296秒->〜12.07x慢

C++ Code C ++代码

#include <cstdio>
#include <cstdlib>
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <conio.h>

// Error-check helpers: on failure, print the failing call's context string,
// source location, and the API's error name/code.
// Wrapped in do { ... } while (0) so each macro expands to a single statement
// and is safe inside unbraced if/else bodies (the original bare `{ ... }`
// form breaks `if (x) CUDA_CALL(...); else ...`).
// NOTE(review): these only print — execution continues after a failure, so a
// failed cudaMalloc leads to later crashes; consider aborting on error.
#define CUDA_CALL(res, str) do { if (res != cudaSuccess) { printf("CUDA Error : %s : %s %d : ERR %s\n", str, __FILE__, __LINE__, cudaGetErrorName(res)); } } while (0)
#define CUBLAS_CALL(res, str) do { if (res != CUBLAS_STATUS_SUCCESS) { printf("CUBLAS Error : %s : %s %d : ERR %d\n", str, __FILE__, __LINE__, int(res)); } } while (0)

// Module-scope timing events: created in d_CUDATimerStart() and destroyed in
// d_CUDATimerStop(). Because state is global, Start/Stop pairs must not be
// nested or used concurrently.
static cudaEvent_t cu_TimerStart;
static cudaEvent_t cu_TimerStop;

// Begin a GPU timing interval: (re)create the module-scope events and record
// the start event on the default stream. Paired with d_CUDATimerStop().
void d_CUDATimerStart(void)
{
    // Fresh events each run; d_CUDATimerStop() destroys them.
    CUDA_CALL(cudaEventCreate(&cu_TimerStop), "Failed to create stop event!");
    CUDA_CALL(cudaEventCreate(&cu_TimerStart), "Failed to create start event!");

    CUDA_CALL(cudaEventRecord(cu_TimerStart), "Failed to record start event!");
}

// End the GPU timing interval started by d_CUDATimerStart(): record the stop
// event, wait for it to complete, and return the elapsed time in milliseconds.
float d_CUDATimerStop(void)
{
    float elapsedMs = 0.0f;

    CUDA_CALL(cudaEventRecord(cu_TimerStop), "Failed to record stop event!");

    // Block the host until the stop event has executed; without this the
    // elapsed-time query below would race the still-running stream.
    CUDA_CALL(cudaEventSynchronize(cu_TimerStop), "Failed to synch stop event!");

    CUDA_CALL(cudaEventElapsedTime(&elapsedMs, cu_TimerStart, cu_TimerStop), "Failed to elapse events!");

    // Tear down the events created in d_CUDATimerStart().
    CUDA_CALL(cudaEventDestroy(cu_TimerStop), "Failed to destroy stop event!");
    CUDA_CALL(cudaEventDestroy(cu_TimerStart), "Failed to destroy start event!");

    return elapsedMs;
}

// Invert the n x n column-major float matrix L on the GPU via batched
// LU factorization (cublasSgetrfBatched) + batched inversion
// (cublasSgetriBatched), and return a freshly malloc()'d host copy of the
// inverse. The caller owns and must free() the returned buffer.
//
// NOTE(review): this uses the *batched* API with batch size 1. The batched
// routines are designed for many small matrices and are known to be slow for
// a single large matrix — for n in the thousands, the non-batched cuSOLVER
// path (cusolverDn<t>getrf + getrs with B = I) is the appropriate choice.
float* d_GetInv(float* L, int n)
{
    cublasHandle_t cu_cublasHandle;
    CUBLAS_CALL(cublasCreate(&cu_cublasHandle), "Failed to initialize cuBLAS!");

    float** adL;      // device array holding 1 pointer -> dL (batched-API form)
    float** adC;      // device array holding 1 pointer -> dC
    float* dL;        // device copy of the input; overwritten by its LU factors
    float* dC;        // device output: the inverse
    int* dLUPivots;   // pivot indices produced by getrf
    int* dLUInfo;     // per-matrix status (0 == success, >0 == singular pivot)

    // Compute the byte count in size_t so n*n cannot overflow int for large n.
    size_t szA = (size_t)n * (size_t)n * sizeof(float);

    CUDA_CALL(cudaMalloc(&adL, sizeof(float*)), "Failed to allocate adL!");
    CUDA_CALL(cudaMalloc(&adC, sizeof(float*)), "Failed to allocate adC!");
    CUDA_CALL(cudaMalloc(&dL, szA), "Failed to allocate dL!");
    CUDA_CALL(cudaMalloc(&dC, szA), "Failed to allocate dC!");
    CUDA_CALL(cudaMalloc(&dLUPivots, n * sizeof(int)), "Failed to allocate dLUPivots!");
    CUDA_CALL(cudaMalloc(&dLUInfo, sizeof(int)), "Failed to allocate dLUInfo!");

    CUDA_CALL(cudaMemcpy(dL, L, szA, cudaMemcpyHostToDevice), "Failed to copy to dL!");
    CUDA_CALL(cudaMemcpy(adL, &dL, sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adL!");
    CUDA_CALL(cudaMemcpy(adC, &dC, sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adC!");

    d_CUDATimerStart();

    // Both calls are issued on the same (default) stream, so getri is
    // guaranteed to run after getrf completes — the cudaDeviceSynchronize()
    // calls the original code placed between them are unnecessary, and the
    // timer's cudaEventSynchronize already covers the end of the work.
    CUBLAS_CALL(cublasSgetrfBatched(cu_cublasHandle, n, adL, n, dLUPivots, dLUInfo, 1), "Failed to perform LU decomp operation!");
    CUBLAS_CALL(cublasSgetriBatched(cu_cublasHandle, n, (const float **)adL, n, dLUPivots, adC, n, dLUInfo, 1), "Failed to perform Inverse operation!");

    float timed = d_CUDATimerStop();

    printf("cublas inverse in: %.5f ms.\n", timed);

    // Surface numerical failure instead of silently returning garbage:
    // a nonzero info means the factorization/inversion hit a zero pivot.
    int hInfo = 0;
    CUDA_CALL(cudaMemcpy(&hInfo, dLUInfo, sizeof(int), cudaMemcpyDeviceToHost), "Failed to copy LU info!");
    if (hInfo != 0)
        printf("Matrix inversion failed: info = %d (matrix is singular).\n", hInfo);

    float* res = (float*)malloc(szA);

    CUDA_CALL(cudaMemcpy(res, dC, szA, cudaMemcpyDeviceToHost), "Failed to copy to res!");

    CUDA_CALL(cudaFree(adL), "Failed to free adL!");
    CUDA_CALL(cudaFree(adC), "Failed to free adC!");
    CUDA_CALL(cudaFree(dL), "Failed to free dL!");
    CUDA_CALL(cudaFree(dC), "Failed to free dC!");
    CUDA_CALL(cudaFree(dLUPivots), "Failed to free dLUPivots!");
    CUDA_CALL(cudaFree(dLUInfo), "Failed to free dLUInfo!");

    CUBLAS_CALL(cublasDestroy(cu_cublasHandle), "Failed to destroy cuBLAS!");

    return res;
}

// Demo driver: fill an n x n matrix with uniform random values in [0, 1],
// invert it on the GPU via d_GetInv(), and wait for a keypress before exit.
int main()
{
    int n = 1000;
    size_t bytes = (size_t)n * n * sizeof(float);

    float* L = (float*)malloc(bytes);
    if (L == NULL)
    {
        printf("Host allocation failed.\n");
        return 1;
    }

    for (int i = 0; i < n * n; i++)
        L[i] = ((float)rand() / (float)(RAND_MAX));

    float* inv = d_GetInv(L, n);

    // d_GetInv() returns a malloc()'d buffer; the original code leaked
    // both it and the input matrix.
    free(inv);
    free(L);

    printf("done.");
    _getch();

    return 0;
}

MATLAB Code MATLAB代码

% MATLAB reference benchmark: time the inverse of a 1000x1000 uniform
% random matrix (compare against the cuBLAS timing printed by d_GetInv).
A = rand(1000);
tic
X = inv(A);
toc

System Info: 系统信息:

GPU: GTX 780 3gb GPU:GTX 780 3GB

CPU: i7-4790S @ 3.20 GHz CPU:i7-4790S @ 3.20 GHz

As @RobertCrovella said, you should not use batched small matrix APIs for a single large matrix inversion. 正如@RobertCrovella所说,您不应将批处理的小矩阵API用于单个大矩阵反转。

Basically you could use the same method as in your code, but with the non-batched versions of getrf() and getri() to maximize the performance for a large matrix. 基本上,您可以使用与代码中相同的方法,但是使用非批处理版本的getrf()和getri()来最大化大型矩阵的性能。

For getrf() you could find it here. 对于getrf()您可以在这里找到它。

http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrf http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrf

For getri() , although the CUDA toolkit does not provide a getri() to solve AX=I , where A is LU-factored by getrf() , it does provide a getrs() to solve AX=B . All you need to do is to set B=I before calling getrs() . 对于getri() ,尽管CUDA工具包不提供getri()来求解AX=I (其中A已由getrf()进行LU分解),但它确实提供了getrs()来求解AX=B 。您需要做的就是在调用getrs()之前设置B=I 。

http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrs http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrs

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

相关问题 对于矩阵乘法,Eigen + MKL比Matlab慢 - Eigen + MKL slower than Matlab for matrix multiplication 在使用显式循环的矩阵乘法中,本征比 Fortran 慢得多 - Eigen is much slower than Fortran in matrix multiplication using an explicit loop 为什么犰狳矩阵计算比Fortran慢得多 - why armadillo matrix computation is much slower than Fortran 为什么GTX Titan上的cublas比单线程CPU代码慢? - Why cublas on GTX Titan is slower than single threaded CPU code? 为什么Strassen矩阵乘法比标准矩阵乘法慢得多? - Why is Strassen matrix multiplication so much slower than standard matrix multiplication? 为什么转置 512x512 的矩阵比转置 513x513 的矩阵慢得多? - Why is transposing a matrix of 512x512 much slower than transposing a matrix of 513x513? C ++ Eigen稀疏矩阵乘法比python scipy.sparse慢得多 - C++ Eigen Sparse Matrix multiplication much slower than python scipy.sparse 基准矩阵乘法性能:C ++(特征)比Python慢​​得多 - Benchmarking matrix multiplication performance: C++ (eigen) is much slower than Python 使用SSE计算矩阵乘积比使用直接算法慢得多 - Calculating matrix product is much slower with SSE than with straight-forward-algorithm 英特尔的多线程比AMD慢多了 - Multithreading on Intel much slower than on AMD
 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM