简体   繁体   English

Cuda 全局函数的工作速度比主机慢

[英]Cuda global Function is working slower, than a host one

I have two voids, which are doing the same thing: multiplication of vector by number.我有两个空,它们在做同样的事情:向量乘以数字。 One is simple c void and the other one is global.一种是简单的 c void,另一种是全局的。 But after measuring time, I found out that regular c function works much faster, than the global one.但是在测量时间后,我发现常规 c 函数的运行速度比全局函数快得多。 Here they are:他们来了:

#define N 1000
__global__ void VectorOnNumber(double *vector1, double number, double *resultVector){
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    if(tid < N){
        resultVector[tid] = vector1[tid]*number;
    }
}

void von(double vec[N], double n, double res[N]){
    for(int i = 0; i < N; i++){
        res[i] = vec[i]*n;
    }
}

int main(){
    double x[N], u[N];
    double *pointerToVector = (double *)x, *pointerToU = (double *)u;
    
    for(int i = 0; i < N; i++){
        x[i] = i*i;
        v[i] = i+i;
    }

    clock_t start = clock();
    von(x, 10, u);
    clock_t end = clock();
    float seconds = (float)(end - start) / CLOCKS_PER_SEC;
    printf("%f\n", seconds); // 0.03 ms
    
    cudaMalloc(&pointerToVector, N*sizeof(double));
    cudaMalloc(&pointerToU, N*sizeof(double));
    cudaMemcpy(pointerToVector, x, N*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(pointerToU, u, N*sizeof(double), cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventRecord(start,0);
    VectorOnNumber<<<N, 1>>>(pointerToVector, 10, pointerToVector);
    cudaEventCreate(&stop);
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start,stop);
    printf(%f ms\n" ,elapsedTime);//gives 0.08ms
     

    cudaFree(pointerToVector);
    cudaFree(pointerToU);
    return 0;
}

Why it happens?为什么会发生? Thank you in advance.先感谢您。

There are a lot of flaws: pointerToU is not used, as v , you do not use dim3 grid,threads .有很多缺陷:不使用pointerToU ,如v ,你不使用dim3 grid,threads Anyway I understand there are N blocks with only one thread so your kernel do not benefit of coalesced memory access, which may be the main reason why the cuda version is slower than the cpu version Try无论如何,我知道只有一个线程有N个块,因此您的内核无法从合并内存访问中受益,这可能是 cuda 版本比 cpu 版本慢的主要原因尝试

VectorOnNumber<<<N/32+1,32>>>(pointerToVector, 10, pointerToVector);

here is my code: GPU kernel:这是我的代码:GPU 内核:

void VectorOnNumber(double *vector1, double number, double *resultVector,int N)
{
    dim3 grid(N/256+1),threads(256);

    VectorOnNumber_K<<<grid,threads>>>(vector1, number, resultVector,N);
}

__global__
void VectorOnNumber_K(double *vector1, double number, double *resultVector,int N)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    if(tid < N){
        resultVector[tid] = vector1[tid]*number;
    }
}


void VectorOnNumberf(float *vector1, float number, float *resultVector,int N)
{
    dim3 grid(N/256+1),threads(256);

    VectorOnNumberf_K<<<grid,threads>>>(vector1, number, resultVector,N);
}

__global__
void VectorOnNumberf_K(float *vector1, float number, float *resultVector,int N)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    if(tid < N){
        resultVector[tid] = vector1[tid]*number;
    }
}


CPU :中央处理器 :

void Stack_von(double *vec, double n, double *res,int N)
{
int i;
    for(i = 0; i < N; i++){
        res[i] = vec[i]*n;
    }
}
void Stack_vonf(float *vec, float n, float *res,int N)
{
int i;
    for(i = 0; i < N; i++){
        res[i] = vec[i]*n;
    }
}

full test:全面测试:

void Stack()
{
int i,N;
double *x,*u,*dx,*du;
float  *fx,*fu,*dfx,*dfu;

    N=1000000;
    x=new double[N];
    u=new double[N];
    fx=new float[N];
    fu=new float[N];

    
    for(i = 0; i < N; i++){
        x[i] = i*i;
        fx[i] = i*i;
    }

    // cpu
    printf("start\n");
    clock_t start = clock();
    for(int k=0; k < 1000; k++) Stack_von(x, 10, u,N);
    clock_t end = clock();
    float seconds = (float)(end - start) / CLOCKS_PER_SEC;
    printf("host double %f ms\n", seconds); 

    start = clock();
    for(int k=0; k < 1000; k++) Stack_vonf(fx, 10, fu,N);
    end = clock();
    seconds = (float)(end - start) / CLOCKS_PER_SEC;
    printf("host float  %f ms\n", seconds); // 0.03 ms
    
    // gpu
    cudaMalloc(&dfx, N*sizeof(float));
    cudaMalloc(&dfu, N*sizeof(float));
    cudaMemcpy(dfx, fx, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMalloc(&dx, N*sizeof(double));
    cudaMalloc(&du, N*sizeof(double));
    cudaMemcpy(dx, x, N*sizeof(double), cudaMemcpyHostToDevice);

    cudaEvent_t dstart, dstop;
    float elapsedTime;
    cudaEventCreate(&dstart);
    cudaEventCreate(&dstop);

    cudaEventRecord(dstart,0);
    VectorOnNumber(dx, 10, du,N);
    cudaEventRecord(dstop,0);
    cudaEventSynchronize(dstop);
    cudaEventElapsedTime(&elapsedTime, dstart,dstop);
    printf("device double %f ms\n" ,elapsedTime);

    cudaEventRecord(dstart,0);
    VectorOnNumberf(dfx, 10, dfu,N);
    cudaEventRecord(dstop,0);
    cudaEventSynchronize(dstop);
    cudaEventElapsedTime(&elapsedTime, dstart,dstop);
    printf("device float  %f ms\n" ,elapsedTime);
     

    cudaFree(dx);
    cudaFree(du);
    cudaFree(dfx);
    cudaFree(dfu);
    delete [] x;
    delete [] u;
    delete [] fx;
    delete [] fu;
}

results host double:1.35ms float 0.45ms device double 0.067ms float 0.037ms结果主机双:1.35ms 浮点 0.45ms 设备双 0.067ms 浮点 0.037ms

device (GTX1080) is 10x faster than host (XEON 3.50GHz 8 cores) I set N to 10^6 to make it measurable设备(GTX1080)比主机(XEON 3.50GHz 8 核)快 10 倍我将 N 设置为 10^6 以使其可测量

BUT time is 1.37ms on device if there is only one thread per block !但是如果每个块只有一个线程,设备上的时间为 1.37 毫秒!

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM