[英]cuBLAS matrix inverse much slower than MATLAB
In my current project, I am attempting to calculate the inverse of a large (n > 2000) matrix with cuBLAS. 在我当前的项目中,我正在尝试使用cuBLAS计算大型(n> 2000)矩阵的逆。 The inverse calculation is performed, but for some reason calculation times are significantly slower than compared to those when done in MATLAB.
进行了逆计算,但是由于某些原因,与在MATLAB中进行计算相比,计算时间明显慢。
I have attached a sample calculation performed on random matrices using my implementation in either language as well as performance results. 我已经附上了使用两种语言以及性能结果对我的随机矩阵执行的示例计算。
Any help or suggestions on what may be causing this slowdown would be greatly appreciated. 对于可能导致此速度下降的任何帮助或建议,将不胜感激。
Thank you in advance. 先感谢您。
Comparison 对照
cuBLAS vs. MATLAB
cuBLAS与MATLAB
N = 500 : cuBLAS ~ 0.130 sec, MATLAB ~ 0.066 sec -> ~1.97x slower
N = 500:cuBLAS〜0.130秒,MATLAB〜0.066秒->〜1.97倍慢
N = 1000 : cuBLAS ~ 0.898 sec, MATLAB ~ 0.311 sec -> ~2.89x slower
N = 1000:cuBLAS〜0.898秒,MATLAB〜0.311秒->〜2.89倍慢
N = 2000 : cuBLAS ~ 6.667 sec, MATLAB ~ 0.659 sec -> ~10.12x slower
N = 2000:cuBLAS〜6.667秒,MATLAB〜0.659秒->〜10.12x慢
N = 4000 : cuBLAS ~ 51.860 sec, MATLAB ~ 4.296 sec -> ~12.07x slower
N = 4000:cuBLAS〜51.860秒,MATLAB〜4.296秒->〜12.07x慢
C++ Code C ++代码
#include <cstdio>
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <conio.h>
// Error-check helpers: on failure, print the caller's message plus file/line
// and the decoded error. Wrapped in do { } while (0) so the macros expand to a
// single statement and are safe inside unguarded if/else bodies (the original
// bare-brace form is a classic dangling-else hazard).
// NOTE: these only report; they do not abort, so later calls may still run
// after a failure.
#define CUDA_CALL(res, str) do { if (res != cudaSuccess) { printf("CUDA Error : %s : %s %d : ERR %s\n", str, __FILE__, __LINE__, cudaGetErrorName(res)); } } while (0)
#define CUBLAS_CALL(res, str) do { if (res != CUBLAS_STATUS_SUCCESS) { printf("CUBLAS Error : %s : %s %d : ERR %d\n", str, __FILE__, __LINE__, int(res)); } } while (0)
static cudaEvent_t cu_TimerStart;
static cudaEvent_t cu_TimerStop;
// Begins a GPU-side timing interval: creates the two global events and
// records the start event on the default stream. Must be paired with a
// later d_CUDATimerStop(), which consumes and destroys both events.
// Not reentrant: uses the file-static cu_TimerStart/cu_TimerStop pair,
// so nested or concurrent timing intervals are not supported.
void d_CUDATimerStart(void)
{
CUDA_CALL(cudaEventCreate(&cu_TimerStart), "Failed to create start event!");
CUDA_CALL(cudaEventCreate(&cu_TimerStop), "Failed to create stop event!");
CUDA_CALL(cudaEventRecord(cu_TimerStart), "Failed to record start event!");
}
// Ends the timing interval opened by d_CUDATimerStart(): records the stop
// event, waits for it to complete (cudaEventSynchronize blocks the host until
// all preceding GPU work on the default stream has finished), and returns the
// elapsed time in milliseconds. Both events are destroyed before returning.
// Returns 0.0f if cudaEventElapsedTime fails (the original left `ms`
// uninitialized on that path and returned garbage).
float d_CUDATimerStop(void)
{
    CUDA_CALL(cudaEventRecord(cu_TimerStop), "Failed to record stop event!");
    CUDA_CALL(cudaEventSynchronize(cu_TimerStop), "Failed to synch stop event!");
    float ms = 0.0f; // defined value even if the elapse query below fails
    CUDA_CALL(cudaEventElapsedTime(&ms, cu_TimerStart, cu_TimerStop), "Failed to elapse events!");
    CUDA_CALL(cudaEventDestroy(cu_TimerStart), "Failed to destroy start event!");
    CUDA_CALL(cudaEventDestroy(cu_TimerStop), "Failed to destroy stop event!");
    return ms;
}
// Inverts the n x n matrix stored densely in host array L (n*n floats) on the
// GPU via LU factorization, and returns a malloc'd host array holding the
// inverse. The caller owns (and must free) the returned buffer. Returns NULL
// if the host result buffer cannot be allocated. Prints timing for the
// factorize+invert phase.
//
// NOTE(perf): cublas<S>getrfBatched/getriBatched are designed for batches of
// many SMALL matrices; calling them with batch size 1 on a single large
// matrix is the cause of the slowdown discussed in this post. For one large
// matrix, use cuSOLVER's getrf() + getrs() (solving A*X = I) instead.
float* d_GetInv(float* L, int n)
{
    cublasHandle_t cu_cublasHandle;
    CUBLAS_CALL(cublasCreate(&cu_cublasHandle), "Failed to initialize cuBLAS!");

    // The batched APIs take device arrays of device pointers, so even for a
    // batch of 1 we need a one-element pointer array (adL/adC) on the device.
    float** adL;
    float** adC;
    float* dL;       // device copy of the input matrix (overwritten by its LU factors)
    float* dC;       // device buffer receiving the inverse
    int* dLUPivots;  // n pivot indices produced by getrf
    int* dLUInfo;    // per-matrix status: 0 = OK, >0 = singular at that pivot
    size_t szA = n * n * sizeof(float);

    CUDA_CALL(cudaMalloc(&adL, sizeof(float*)), "Failed to allocate adL!");
    CUDA_CALL(cudaMalloc(&adC, sizeof(float*)), "Failed to allocate adC!");
    CUDA_CALL(cudaMalloc(&dL, szA), "Failed to allocate dL!");
    CUDA_CALL(cudaMalloc(&dC, szA), "Failed to allocate dC!");
    CUDA_CALL(cudaMalloc(&dLUPivots, n * sizeof(int)), "Failed to allocate dLUPivots!");
    CUDA_CALL(cudaMalloc(&dLUInfo, sizeof(int)), "Failed to allocate dLUInfo!");

    CUDA_CALL(cudaMemcpy(dL, L, szA, cudaMemcpyHostToDevice), "Failed to copy to dL!");
    CUDA_CALL(cudaMemcpy(adL, &dL, sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adL!");
    CUDA_CALL(cudaMemcpy(adC, &dC, sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adC!");

    d_CUDATimerStart();
    CUBLAS_CALL(cublasSgetrfBatched(cu_cublasHandle, n, adL, n, dLUPivots, dLUInfo, 1), "Failed to perform LU decomp operation!");
    CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");
    CUBLAS_CALL(cublasSgetriBatched(cu_cublasHandle, n, (const float **)adL, n, dLUPivots, adC, n, dLUInfo, 1), "Failed to perform Inverse operation!");
    CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");
    float timed = d_CUDATimerStop();
    printf("cublas inverse in: %.5f ms.\n", timed);

    // The original never read the info flag back, so a singular input silently
    // produced garbage. Report it here (result buffer is still returned).
    int hInfo = 0;
    CUDA_CALL(cudaMemcpy(&hInfo, dLUInfo, sizeof(int), cudaMemcpyDeviceToHost), "Failed to copy to hInfo!");
    if (hInfo != 0)
        printf("Warning: LU factorization reported info = %d (matrix singular?); inverse is invalid.\n", hInfo);

    float* res = (float*)malloc(szA);
    if (res != NULL)
        CUDA_CALL(cudaMemcpy(res, dC, szA, cudaMemcpyDeviceToHost), "Failed to copy to res!");
    else
        printf("Failed to allocate host result buffer!\n");

    CUDA_CALL(cudaFree(adL), "Failed to free adL!");
    CUDA_CALL(cudaFree(adC), "Failed to free adC!");
    CUDA_CALL(cudaFree(dL), "Failed to free dL!");
    CUDA_CALL(cudaFree(dC), "Failed to free dC!");
    CUDA_CALL(cudaFree(dLUPivots), "Failed to free dLUPivots!");
    CUDA_CALL(cudaFree(dLUInfo), "Failed to free dLUInfo!");
    CUBLAS_CALL(cublasDestroy(cu_cublasHandle), "Failed to destroy cuBLAS!");
    return res;
}
// Builds an n x n matrix of uniform [0,1) random floats (rand() is left
// unseeded on purpose, so runs are reproducible), inverts it on the GPU, and
// waits for a keypress before exiting (Windows _getch from <conio.h>).
// The original leaked both the input and the result buffer; freed here.
int main()
{
    int n = 1000;
    float* L = (float*)malloc(n * n * sizeof(float));
    if (L == NULL)
    {
        printf("Failed to allocate input matrix!\n");
        return 1;
    }
    for (int i = 0; i < n * n; i++)
        L[i] = ((float)rand() / (float)(RAND_MAX));
    float* inv = d_GetInv(L, n);
    printf("done.");
    free(inv);
    free(L);
    _getch();
    return 0;
}
MATLAB Code MATLAB代码
% Reference timing: invert a random 1000 x 1000 matrix on the CPU
% and print the elapsed wall-clock time via tic/toc.
A = rand(1000);
tic
X = inv(A);
toc
System Info: 系统信息:
GPU: GTX 780 3gb GPU:GTX 780 3GB
CPU: i7-4790S @ 3.20 GHz CPU:i7-4790S @ 3.20 GHz
As @RobertCrovella said, you should not use batched small matrix APIs for a single large matrix inversion. 正如@RobertCrovella所说,您不应将批处理的小矩阵API用于单个大矩阵反转。
Basically you could use the same method as in your code, but with the non-batched version of getrf()
and getri()
to maximize the performance for a large matrix. 基本上,您可以使用与代码中相同的方法,但是使用非批处理版本的
getrf()
和getri()
来最大化大型矩阵的性能。
For getrf()
you could find it here. 对于
getrf()
您可以在这里找到它。
http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrf http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrf
For getri()
, although CUDA toolkit does not provide a getri()
to solve AX=I
, where A
is LU-factored by getrf()
, it does provide a getrs()
to solve AX=B
. 对于
getri()
,尽管CUDA工具包不提供getri()
来解决AX=I
,其中A
由getrf()
LU支持,但确实提供了getrs()
来解决AX=B
All you need to do is to set B=I
before calling getrs()
. 您需要做的就是在调用
getrs()
之前设置B=I
http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrs http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-getrs
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.