[英]Cuda global Function is working slower, than a host one
我有兩個空,它們在做同樣的事情:向量乘以數字。 一種是簡單的 c void,另一種是全局的。 但是在測量時間后,我發現常規 c 函數的運行速度比全局函數快得多。 他們來了:
#define N 1000
__global__ void VectorOnNumber(double *vector1, double number, double *resultVector){
int tid = threadIdx.x + blockIdx.x*blockDim.x;
if(tid < N){
resultVector[tid] = vector1[tid]*number;
}
}
void von(double vec[N], double n, double res[N]){
for(int i = 0; i < N; i++){
res[i] = vec[i]*n;
}
}
int main(){
double x[N], u[N];
double *pointerToVector = (double *)x, *pointerToU = (double *)u;
for(int i = 0; i < N; i++){
x[i] = i*i;
v[i] = i+i;
}
clock_t start = clock();
von(x, 10, u);
clock_t end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;
printf("%f\n", seconds); // 0.03 ms
cudaMalloc(&pointerToVector, N*sizeof(double));
cudaMalloc(&pointerToU, N*sizeof(double));
cudaMemcpy(pointerToVector, x, N*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(pointerToU, u, N*sizeof(double), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventRecord(start,0);
VectorOnNumber<<<N, 1>>>(pointerToVector, 10, pointerToVector);
cudaEventCreate(&stop);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start,stop);
printf(%f ms\n" ,elapsedTime);//gives 0.08ms
cudaFree(pointerToVector);
cudaFree(pointerToU);
return 0;
}
為什么會發生? 先感謝您。
有很多缺陷:不使用pointerToU
,如v
,你不使用dim3 grid,threads
。 無論如何,我知道只有一個線程有N
個塊,因此您的內核無法從合並內存訪問中受益,這可能是 cuda 版本比 cpu 版本慢的主要原因嘗試
VectorOnNumber<<<N/32+1,32>>>(pointerToVector, 10, pointerToVector);
這是我的代碼:GPU 內核:
void VectorOnNumber(double *vector1, double number, double *resultVector,int N)
{
dim3 grid(N/256+1),threads(256);
VectorOnNumber_K<<<grid,threads>>>(vector1, number, resultVector,N);
}
__global__
void VectorOnNumber_K(double *vector1, double number, double *resultVector,int N)
{
int tid = threadIdx.x + blockIdx.x*blockDim.x;
if(tid < N){
resultVector[tid] = vector1[tid]*number;
}
}
void VectorOnNumberf(float *vector1, float number, float *resultVector,int N)
{
dim3 grid(N/256+1),threads(256);
VectorOnNumberf_K<<<grid,threads>>>(vector1, number, resultVector,N);
}
__global__
void VectorOnNumberf_K(float *vector1, float number, float *resultVector,int N)
{
int tid = threadIdx.x + blockIdx.x*blockDim.x;
if(tid < N){
resultVector[tid] = vector1[tid]*number;
}
}
中央處理器 :
void Stack_von(double *vec, double n, double *res,int N)
{
int i;
for(i = 0; i < N; i++){
res[i] = vec[i]*n;
}
}
void Stack_vonf(float *vec, float n, float *res,int N)
{
int i;
for(i = 0; i < N; i++){
res[i] = vec[i]*n;
}
}
全面測試:
void Stack()
{
int i,N;
double *x,*u,*dx,*du;
float *fx,*fu,*dfx,*dfu;
N=1000000;
x=new double[N];
u=new double[N];
fx=new float[N];
fu=new float[N];
for(i = 0; i < N; i++){
x[i] = i*i;
fx[i] = i*i;
}
// cpu
printf("start\n");
clock_t start = clock();
for(int k=0; k < 1000; k++) Stack_von(x, 10, u,N);
clock_t end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;
printf("host double %f ms\n", seconds);
start = clock();
for(int k=0; k < 1000; k++) Stack_vonf(fx, 10, fu,N);
end = clock();
seconds = (float)(end - start) / CLOCKS_PER_SEC;
printf("host float %f ms\n", seconds); // 0.03 ms
// gpu
cudaMalloc(&dfx, N*sizeof(float));
cudaMalloc(&dfu, N*sizeof(float));
cudaMemcpy(dfx, fx, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMalloc(&dx, N*sizeof(double));
cudaMalloc(&du, N*sizeof(double));
cudaMemcpy(dx, x, N*sizeof(double), cudaMemcpyHostToDevice);
cudaEvent_t dstart, dstop;
float elapsedTime;
cudaEventCreate(&dstart);
cudaEventCreate(&dstop);
cudaEventRecord(dstart,0);
VectorOnNumber(dx, 10, du,N);
cudaEventRecord(dstop,0);
cudaEventSynchronize(dstop);
cudaEventElapsedTime(&elapsedTime, dstart,dstop);
printf("device double %f ms\n" ,elapsedTime);
cudaEventRecord(dstart,0);
VectorOnNumberf(dfx, 10, dfu,N);
cudaEventRecord(dstop,0);
cudaEventSynchronize(dstop);
cudaEventElapsedTime(&elapsedTime, dstart,dstop);
printf("device float %f ms\n" ,elapsedTime);
cudaFree(dx);
cudaFree(du);
cudaFree(dfx);
cudaFree(dfu);
delete [] x;
delete [] u;
delete [] fx;
delete [] fu;
}
結果主機雙:1.35ms 浮點 0.45ms 設備雙 0.067ms 浮點 0.037ms
設備(GTX1080)比主機(XEON 3.50GHz 8 核)快 10 倍我將 N 設置為 10^6 以使其可測量
但是如果每個塊只有一個線程,設備上的時間為 1.37 毫秒!
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.