简体   繁体   中英

Cuda global Function is working slower, than a host one

I have two voids, which are doing the same thing: multiplication of vector by number. One is simple c void and the other one is global. But after measuring time, I found out that regular c function works much faster, than the global one. Here they are:

#define N 1000
__global__ void VectorOnNumber(double *vector1, double number, double *resultVector){
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    if(tid < N){
        resultVector[tid] = vector1[tid]*number;
    }
}

void von(double vec[N], double n, double res[N]){
    for(int i = 0; i < N; i++){
        res[i] = vec[i]*n;
    }
}

int main(){
    double x[N], u[N];
    double *pointerToVector = (double *)x, *pointerToU = (double *)u;
    
    for(int i = 0; i < N; i++){
        x[i] = i*i;
        v[i] = i+i;
    }

    clock_t start = clock();
    von(x, 10, u);
    clock_t end = clock();
    float seconds = (float)(end - start) / CLOCKS_PER_SEC;
    printf("%f\n", seconds); // 0.03 ms
    
    cudaMalloc(&pointerToVector, N*sizeof(double));
    cudaMalloc(&pointerToU, N*sizeof(double));
    cudaMemcpy(pointerToVector, x, N*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(pointerToU, u, N*sizeof(double), cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventRecord(start,0);
    VectorOnNumber<<<N, 1>>>(pointerToVector, 10, pointerToVector);
    cudaEventCreate(&stop);
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start,stop);
    printf(%f ms\n" ,elapsedTime);//gives 0.08ms
     

    cudaFree(pointerToVector);
    cudaFree(pointerToU);
    return 0;
}

Why it happens? Thank you in advance.

There are a lot of flaws: pointerToU is not used, as v , you do not use dim3 grid,threads . Anyway I understand there are N blocks with only one thread so your kernel do not benefit of coalesced memory access, which may be the main reason why the cuda version is slower than the cpu version Try

VectorOnNumber<<<N/32+1,32>>>(pointerToVector, 10, pointerToVector);

here is my code: GPU kernel:

void VectorOnNumber(double *vector1, double number, double *resultVector,int N)
{
    dim3 grid(N/256+1),threads(256);

    VectorOnNumber_K<<<grid,threads>>>(vector1, number, resultVector,N);
}

__global__
void VectorOnNumber_K(double *vector1, double number, double *resultVector,int N)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    if(tid < N){
        resultVector[tid] = vector1[tid]*number;
    }
}


void VectorOnNumberf(float *vector1, float number, float *resultVector,int N)
{
    dim3 grid(N/256+1),threads(256);

    VectorOnNumberf_K<<<grid,threads>>>(vector1, number, resultVector,N);
}

__global__
void VectorOnNumberf_K(float *vector1, float number, float *resultVector,int N)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    if(tid < N){
        resultVector[tid] = vector1[tid]*number;
    }
}


CPU :

void Stack_von(double *vec, double n, double *res,int N)
{
int i;
    for(i = 0; i < N; i++){
        res[i] = vec[i]*n;
    }
}
void Stack_vonf(float *vec, float n, float *res,int N)
{
int i;
    for(i = 0; i < N; i++){
        res[i] = vec[i]*n;
    }
}

full test:

void Stack()
{
int i,N;
double *x,*u,*dx,*du;
float  *fx,*fu,*dfx,*dfu;

    N=1000000;
    x=new double[N];
    u=new double[N];
    fx=new float[N];
    fu=new float[N];

    
    for(i = 0; i < N; i++){
        x[i] = i*i;
        fx[i] = i*i;
    }

    // cpu
    printf("start\n");
    clock_t start = clock();
    for(int k=0; k < 1000; k++) Stack_von(x, 10, u,N);
    clock_t end = clock();
    float seconds = (float)(end - start) / CLOCKS_PER_SEC;
    printf("host double %f ms\n", seconds); 

    start = clock();
    for(int k=0; k < 1000; k++) Stack_vonf(fx, 10, fu,N);
    end = clock();
    seconds = (float)(end - start) / CLOCKS_PER_SEC;
    printf("host float  %f ms\n", seconds); // 0.03 ms
    
    // gpu
    cudaMalloc(&dfx, N*sizeof(float));
    cudaMalloc(&dfu, N*sizeof(float));
    cudaMemcpy(dfx, fx, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMalloc(&dx, N*sizeof(double));
    cudaMalloc(&du, N*sizeof(double));
    cudaMemcpy(dx, x, N*sizeof(double), cudaMemcpyHostToDevice);

    cudaEvent_t dstart, dstop;
    float elapsedTime;
    cudaEventCreate(&dstart);
    cudaEventCreate(&dstop);

    cudaEventRecord(dstart,0);
    VectorOnNumber(dx, 10, du,N);
    cudaEventRecord(dstop,0);
    cudaEventSynchronize(dstop);
    cudaEventElapsedTime(&elapsedTime, dstart,dstop);
    printf("device double %f ms\n" ,elapsedTime);

    cudaEventRecord(dstart,0);
    VectorOnNumberf(dfx, 10, dfu,N);
    cudaEventRecord(dstop,0);
    cudaEventSynchronize(dstop);
    cudaEventElapsedTime(&elapsedTime, dstart,dstop);
    printf("device float  %f ms\n" ,elapsedTime);
     

    cudaFree(dx);
    cudaFree(du);
    cudaFree(dfx);
    cudaFree(dfu);
    delete [] x;
    delete [] u;
    delete [] fx;
    delete [] fu;
}

results host double:1.35ms float 0.45ms device double 0.067ms float 0.037ms

device (GTX1080) is 10x faster than host (XEON 3.50GHz 8 cores) I set N to 10^6 to make it measurable

BUT time is 1.37ms on device if there is only one thread per block !

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM