简体   繁体   English

与cuda的向量和数组

[英]Sum array of vectors with cuda

I need to compute the average of thousands (20,000+) of images, each represented as an unsigned short array. 我需要计算数千张（20,000+）图像的平均值，每张图像用一个 unsigned short 数组表示。 Could you please review my code? It looks to me like it is not optimal: 你能帮我检查一下代码吗？在我看来这段代码不是最优的：

my kernel: 我的内核:

/**
 * Accumulate one scaled image into a running per-pixel sum:
 * B[i] = A[i] / div + B[i] for every i < Size.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid
 * must supply at least Size threads; threads whose index is >= Size do
 * nothing. No shared memory is used.
 *
 * @param A    input pixels, Size elements, dereferenced on the device
 * @param B    accumulator, Size elements, read and updated in place
 * @param Size number of elements in A and B
 * @param div  divisor applied to each input pixel before it is added
 */
__global__ void VecAdd(unsigned short *A, float *B,  unsigned int Size, float div){

  // Unsigned index: the same type as Size, so the bounds check is not a
  // mixed signed/unsigned comparison. The `register` storage class of the
  // original was dropped because C++17 removed it.
  const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if ( idx < Size) {
   B[ idx ] = (float) A[idx] /  div + B[idx];
  }
}

kernel wrapper: 内核包装器:

/**
 * Add one image, divided by div, to a running per-pixel sum using the GPU.
 *
 * Every call allocates two device buffers, uploads both host arrays,
 * launches VecAdd, downloads the updated sums into pixels2 and frees the
 * buffers again.
 *
 * @param pixels1 host array holding `length` input pixels
 * @param pixels2 host array holding `length` running sums; read, then
 *                overwritten with the updated sums
 * @param length  number of elements in each array
 * @param div     divisor applied to every input pixel before it is added
 *
 * If any CUDA call fails, the remaining steps are skipped and pixels2 is
 * left as it was; the failure itself is not reported to the caller because
 * the function returns void.
 */
void kernel_wrapper(unsigned short* pixels1, float* pixels2,  unsigned int length, float div)
{
    if (length == 0) {
        return; // nothing to accumulate; a zero-block launch would be invalid
    }

    // NULL-initialised so the cudaFree calls below are safe on every path.
    unsigned short* deviceData1 = NULL;
    float* deviceData2 = NULL;

    const size_t inputBytes = (size_t)length * sizeof(unsigned short);
    const size_t sumBytes   = (size_t)length * sizeof(float);

    cudaError_t err = cudaMalloc((void**)&deviceData1, inputBytes);
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&deviceData2, sumBytes);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(deviceData1, pixels1, inputBytes, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(deviceData2, pixels2, sumBytes, cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
        const unsigned int threads = 1024; // the original author's stated per-block maximum
        // Round up so a trailing partial block is still launched; plain
        // length / threads silently drops the last (length % threads)
        // elements. length = 1280*960 -> blocks = 1200.
        const unsigned int blocks = length / threads + ((length % threads != 0u) ? 1u : 0u);

        VecAdd<<< blocks, threads >>>( deviceData1, deviceData2,  length, div );
        err = cudaGetLastError(); // launch-configuration errors surface here
    }

    if (err == cudaSuccess) {
        // Blocking copy: waits for the kernel before reading the sums back.
        cudaMemcpy(pixels2, deviceData2, sumBytes, cudaMemcpyDeviceToHost);
    }

    cudaFree( deviceData1 );
    cudaFree( deviceData2 );
}

and I call it like this: 我这样调用它：

// Host-side accumulator: one float per pixel (width * height elements).
float* avrg2f = (float*)malloc( width * height * sizeof(float));
// Zero the accumulator. memset's second parameter is an int fill byte, so
// the 0.0 literal is implicitly converted to 0; an all-zero bit pattern is
// 0.0f for IEEE-754 floats.
memset( avrg2f, 0.0, sizeof(float) * width * height);

// One wrapper call per image; passing count as the divisor means each call
// adds image/count to avrg2f.
for (int k = 0; k < count; k++) {           
 // NOTE(review): presumably this loads image k into avrg1 -- neither avrg1
 // nor getImage is visible in this fragment; confirm.
 imageObjectList.at( curObj )->getImage( k );
 kernel_wrapper( avrg1, avrg2f, height * width, (float)count);  
}

As a result, my averaged image will be in avrg2f. 这样，我的平均图像就会保存在 avrg2f 中。

Thank you. 谢谢。

If the images are all the same size, then your wrapper function need not do cudaMalloc and cudaFree operations on every call. 如果图像大小相同，那么你的包装函数不需要在每次调用时都执行 cudaMalloc 和 cudaFree 操作。

Pre-allocate the storage needed, and don't allocate and free it on every call to the wrapper. 预先分配所需的存储空间，不要在每次调用包装函数时都分配和释放它。

In addition, you may see something like a ~2x speedup (for the cudaMemcpy operations) if you use pinned allocations ( cudaHostAlloc ) on the host side for your image storage. 此外，如果您在主机端使用锁页（pinned）内存分配（ cudaHostAlloc ）来存储图像，cudaMemcpy 操作可能会有约 2 倍的加速。

Finally, for the duration of your loop, there's no need to copy the results back to the host. 最后,在循环期间,无需将结果复制回主机。 Do this after you're done computing the average. 完成计算平均值后执行此操作。 This will save 2 out of the 3 cudaMemcpy operations you are doing in the wrapper. 这将节省您在包装器中执行的3个cudaMemcpy操作中的2个。

While we're at it, in my opinion using memset to initialize a float array is questionable. 顺便说一下，在我看来，使用 memset 来初始化 float 数组是值得商榷的。 It works for a zero value, but essentially no other. 它适用于零值，但基本上不适用于其他值。 Furthermore, I would expect passing 0.0 as the second parameter to memset to at least throw a compiler warning. 此外，我认为将 0.0 作为第二个参数传递给 memset 至少应该引发编译器警告。

The following code shows the above optimizations, and demonstrates about an 8x speedup over your code in my test case: 以下代码显示了上述优化,并演示了在我的测试用例中代码速度提高了8倍:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>

/**
 * Baseline accumulation kernel: B[i] = A[i] / div + B[i] for every i < Size.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid
 * must supply at least Size threads; threads whose index is >= Size do
 * nothing. No shared memory is used.
 *
 * @param A    input pixels, Size elements, dereferenced on the device
 * @param B    accumulator, Size elements, read and updated in place
 * @param Size number of elements in A and B
 * @param div  divisor applied to each input pixel before it is added
 */
__global__ void VecAdd(unsigned short *A, float *B,  unsigned int Size, float div){

  // Unsigned index keeps the comparison with Size in one signedness. The
  // `register` storage class was dropped: C++17 removed it and compilers
  // have long ignored it as an allocation hint.
  const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if ( idx < Size) {
   B[ idx ] = (float) A[idx] /  div + B[idx];
  }
}

/**
 * Accumulation kernel using a multiply instead of a divide:
 * B[i] = A[i] * mult + B[i] for every i < Size.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid
 * must supply at least Size threads; threads whose index is >= Size do
 * nothing. No shared memory is used.
 *
 * @param A    input pixels, Size elements, dereferenced on the device
 * @param B    accumulator, Size elements, read and updated in place
 * @param Size number of elements in A and B
 * @param mult factor applied to each input pixel before it is added
 */
__global__ void VecAdd2(unsigned short *A, float *B,  unsigned int Size, float mult){

  // Unsigned index keeps the comparison with Size in one signedness; the
  // C++17-removed `register` storage class is no longer used.
  const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if ( idx < Size) {
   B[ idx ] = (float) A[idx] * mult + B[idx];
  }
}

/**
 * Baseline wrapper: add one image, divided by div, to a running per-pixel
 * sum using the GPU.
 *
 * Every call allocates two device buffers, uploads both host arrays,
 * launches VecAdd, downloads the updated sums into pixels2 and frees the
 * buffers again. The per-call allocation and round trip are kept on
 * purpose: this is the slow reference that kernel_wrapper2 is timed against.
 *
 * @param pixels1 host array holding `length` input pixels
 * @param pixels2 host array holding `length` running sums; read, then
 *                overwritten with the updated sums
 * @param length  number of elements in each array
 * @param div     divisor applied to every input pixel before it is added
 *
 * On the first failing CUDA call the remaining steps are skipped, pixels2
 * is left as it was, and the error string is written to stderr.
 */
void kernel_wrapper(unsigned short* pixels1, float* pixels2,  unsigned int length, float div)
{
    if (length == 0) {
        return; // nothing to accumulate; a zero-block launch would be invalid
    }

    // NULL-initialised so the cudaFree calls below are safe on every path.
    unsigned short* deviceData1 = NULL;
    float* deviceData2 = NULL;

    const size_t inputBytes = (size_t)length * sizeof(unsigned short);
    const size_t sumBytes   = (size_t)length * sizeof(float);

    cudaError_t err = cudaMalloc((void**)&deviceData1, inputBytes);
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&deviceData2, sumBytes);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(deviceData1, pixels1, inputBytes, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(deviceData2, pixels2, sumBytes, cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
        const unsigned int threads = 1024; // the original author's stated per-block maximum
        // Round up so a trailing partial block is still launched; plain
        // length / threads silently drops the last (length % threads)
        // elements. length = 1280*960 -> blocks = 1200.
        const unsigned int blocks = length / threads + ((length % threads != 0u) ? 1u : 0u);

        VecAdd<<< blocks, threads >>>( deviceData1, deviceData2,  length, div );
        err = cudaGetLastError(); // launch-configuration errors surface here
    }

    if (err == cudaSuccess) {
        // Blocking copy: waits for the kernel and reports any error it hit.
        err = cudaMemcpy(pixels2, deviceData2, sumBytes, cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "kernel_wrapper: %s\n", cudaGetErrorString(err));
    }

    cudaFree( deviceData1 );
    cudaFree( deviceData2 );
}
/**
 * Optimised wrapper: upload one image into a caller-owned device buffer and
 * add it, scaled by my_mult, to a caller-owned device accumulator.
 *
 * Nothing is allocated, freed or copied back here; the caller keeps both
 * device buffers alive across calls and downloads d_pixels2 when it wants
 * the result. The launch is not followed by a synchronisation in this
 * function.
 *
 * @param h_pixels1 host array holding `length` input pixels
 * @param d_pixels1 device buffer of at least `length` unsigned shorts used
 *                  as the upload target
 * @param d_pixels2 device accumulator of at least `length` floats, updated
 *                  in place by VecAdd2
 * @param length    number of elements
 * @param my_mult   factor applied to every input pixel before it is added
 *
 * A failed upload or launch is reported on stderr; after a failed upload
 * the kernel is not launched.
 */
void kernel_wrapper2(unsigned short* h_pixels1, unsigned short* d_pixels1, float* d_pixels2,  unsigned int length, float my_mult)
{
    if (length == 0) {
        return; // nothing to accumulate; a zero-block launch would be invalid
    }

    cudaError_t err = cudaMemcpy(d_pixels1, h_pixels1,
                                 (size_t)length * sizeof(unsigned short),
                                 cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        const unsigned int threads = 1024; // the original author's stated per-block maximum
        // Round up so a trailing partial block is still launched; plain
        // length / threads silently drops the last (length % threads)
        // elements. length = 1280*960 -> blocks = 1200.
        const unsigned int blocks = length / threads + ((length % threads != 0u) ? 1u : 0u);

        VecAdd2<<< blocks, threads >>>( d_pixels1, d_pixels2,  length, my_mult );
        err = cudaGetLastError(); // launch-configuration errors surface here
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "kernel_wrapper2: %s\n", cudaGetErrorString(err));
    }
}

/**
 * Benchmark driver: averages `count` copies of a synthetic 1280x960 image
 * twice -- once through kernel_wrapper (allocate/copy/free on every call)
 * and once through kernel_wrapper2 (pinned host image, device buffers
 * allocated once, result copied back once) -- prints both elapsed times in
 * microseconds and compares the two results element by element.
 *
 * @return 0 when both results agree to within 0.0001 everywhere; 1 on the
 *         first mismatch or when an allocation or CUDA call fails.
 */
int main(){

  const int count = 2000;
  const int width = 1280;
  const int height = 960;
  const int numPixels = height * width;
  const size_t imageBytes = (size_t)numPixels * sizeof(unsigned short);
  const size_t sumBytes = (size_t)numPixels * sizeof(float);
  timeval t1, t2;
  unsigned long et;

  // Every resource is declared up front and starts out NULL so the single
  // cleanup section at the bottom can release whatever was acquired, no
  // matter where the work stopped.
  unsigned short *h1_image = NULL;
  float *avrg2f = NULL;
  unsigned short *h2_image = NULL;
  float *avrg3f = NULL;
  unsigned short *d_image = NULL;
  float *d_result = NULL;
  cudaError_t err = cudaSuccess;
  int status = 1;  // switched to 0 only once the comparison has passed

  do {
    h1_image = (unsigned short *)malloc(imageBytes);
    avrg2f = (float *)malloc(sumBytes);
    if (h1_image == NULL || avrg2f == NULL) {
      fprintf(stderr, "host allocation failed\n");
      break;
    }
    for (int i = 0; i < numPixels; i++){
      h1_image[i] = (i%256);
      avrg2f[i] = 0.0f;
    }

    // Timed region 1: the baseline wrapper, called once per image.
    gettimeofday(&t1,NULL);
    for (int k = 0; k < count; k++) {
      kernel_wrapper( h1_image, avrg2f, numPixels, (float)count);
    }
    gettimeofday(&t2,NULL);
    // Subtract the fields before scaling so the intermediate stays small.
    et = (unsigned long)((t2.tv_sec - t1.tv_sec) * 1000000L + (t2.tv_usec - t1.tv_usec));
    printf("time 1 = %lu us\n", et);

    avrg3f = (float *)malloc(sumBytes);
    if (avrg3f == NULL) {
      fprintf(stderr, "host allocation failed\n");
      break;
    }
    // Pinned host memory for the image that is uploaded on every iteration.
    err = cudaHostAlloc((void **)&h2_image, imageBytes, cudaHostAllocDefault);
    if (err != cudaSuccess) break;
    for (int i = 0; i < numPixels; i++){
      h2_image[i] = (i%256);
      avrg3f[i] = 0.0f;
    }

    // Timed region 2: device allocation, initial upload of the accumulator,
    // the per-image loop, and the single download of the result.
    gettimeofday(&t1,NULL);
    err = cudaMalloc((void **)&d_image, imageBytes);
    if (err != cudaSuccess) break;
    err = cudaMalloc((void **)&d_result, sumBytes);
    if (err != cudaSuccess) break;
    err = cudaMemcpy(d_result, avrg3f, sumBytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) break;
    for (int k = 0; k < count; k++) {
      kernel_wrapper2( h2_image, d_image,  d_result, numPixels, 1.0f / (float)count);
    }
    // Blocking copy: waits for the queued kernels and reports their errors.
    err = cudaMemcpy(avrg3f, d_result, sumBytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) break;
    gettimeofday(&t2,NULL);
    et = (unsigned long)((t2.tv_sec - t1.tv_sec) * 1000000L + (t2.tv_usec - t1.tv_usec));
    printf("time 2 = %lu us\n", et);

    // The two paths divide vs. multiply by a reciprocal, so compare with a
    // tolerance rather than for exact equality.
    status = 0;
    for (int i = 0; i < numPixels; i++) {
      if (fabs(avrg2f[i] - avrg3f[i]) > 0.0001) {
        printf("mismatch at %d, 1 = %f, 2 = %f\n", i, avrg2f[i], avrg3f[i]);
        status = 1;
        break;
      }
    }
  } while (0);

  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
  }

  // Release everything that was acquired, on success and failure alike.
  cudaFree(d_result);
  cudaFree(d_image);
  if (h2_image != NULL) cudaFreeHost(h2_image);
  free(avrg3f);
  free(avrg2f);
  free(h1_image);
  return status;
}

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM