[英]Sum array of vectors with cuda
I need to find the average of thousands (20,000+) of images represented by unsigned short arrays. 我需要计算由 unsigned short 数组表示的数千(20,000+)幅图像的平均值。 Could you please check my code? It looks to me like it is not optimal: 能否请您帮我检查一下代码?在我看来它并不是最优的:
my kernel: 我的内核:
/*
 * Accumulates one scaled input array into a running sum:
 *     B[i] = A[i] / div + B[i]   for every i < Size.
 *
 * A    - input samples, read on the device.
 * B    - float accumulators, updated in place on the device.
 * Size - number of elements to process in A and B.
 * div  - divisor applied to each sample before it is added.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. Threads
 * whose global index is >= Size do nothing, so the grid may overshoot Size.
 * No shared memory is used and no block-level barrier is required.
 */
__global__ void VecAdd(unsigned short *A, float *B, unsigned int Size, float div){
    // Form the index in size_t so blockIdx.x * blockDim.x cannot wrap around.
    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < Size) {
        B[idx] = (float)A[idx] / div + B[idx];
    }
}
kernel wrapper: 内核包装器:
/*
 * Adds one host image, divided by `div`, into a host-side running sum by
 * round-tripping both buffers through the GPU.
 *
 * pixels1 - host array of `length` unsigned short samples (input).
 * pixels2 - host array of `length` floats; read as the current sum and
 *           overwritten with the updated sum on success.
 * length  - number of elements in both arrays.
 * div     - divisor passed through to the VecAdd kernel.
 *
 * Device buffers are allocated and released on every call. On any CUDA
 * failure a message is written to stderr, the remaining steps are skipped,
 * and pixels2 is left as it was.
 */
void kernel_wrapper(unsigned short* pixels1, float* pixels2, unsigned int length, float div)
{
    if (length == 0) {
        return; // nothing to accumulate; a zero-block launch would be rejected
    }

    const size_t inputBytes = (size_t)length * sizeof(unsigned short);
    const size_t sumBytes   = (size_t)length * sizeof(float);

    // Start from NULL so the cudaFree calls below are safe on every path.
    unsigned short* deviceData1 = NULL;
    float* deviceData2 = NULL;

    cudaError_t err = cudaMalloc((void**)&deviceData1, inputBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&deviceData2, sumBytes);
    if (err == cudaSuccess) err = cudaMemcpy(deviceData1, pixels1, inputBytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(deviceData2, pixels2, sumBytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        const unsigned int threads = 1024; // my maximum
        // Round up so a trailing partial block is still launched;
        // length = 1280*960 -> blocks = 1200.
        const unsigned int blocks = length / threads + (length % threads != 0 ? 1u : 0u);
        VecAdd<<< blocks, threads >>>( deviceData1, deviceData2, length, div );
        err = cudaGetLastError(); // launch-configuration errors show up here
    }

    if (err == cudaSuccess) err = cudaMemcpy(pixels2, deviceData2, sumBytes, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        fprintf(stderr, "kernel_wrapper: CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree( deviceData1 );
    cudaFree( deviceData2 );
}
and I do 而我呢
// Host-side accumulator: one float per pixel (width * height elements).
float* avrg2f = (float*)malloc( width * height * sizeof(float));
// NOTE(review): memset takes an int fill byte, so 0.0 is converted to 0 and
// the buffer is zero-filled byte by byte.
memset( avrg2f, 0.0, sizeof(float) * width * height);
// One kernel_wrapper call per image, with count passed as the divisor.
for (int k = 0; k < count; k++) {
// Presumably loads image k into avrg1 -- getImage and avrg1 are not shown
// here; confirm against the caller.
imageObjectList.at( curObj )->getImage( k );
kernel_wrapper( avrg1, avrg2f, height * width, (float)count);
}
As a result, my averaged image will be in avrg2f. 这样,我的平均图像最终将保存在 avrg2f 中。
Thank you. 谢谢。
If the images are all the same size, then your wrapper function need not do cudaMalloc
and cudaFree
operations on every call. 如果图像大小相同,那么你的包装函数不需要在每次调用时都执行cudaMalloc
和cudaFree
操作。
Pre-allocate the storage needed, and don't allocate and free it on every call to the wrapper. 预先分配所需的存储空间,不要在每次调用包装器时都分配和释放它。
In addition you may see something like a ~2x speedup (for the cudaMemcpy
operations) if you use pinned allocations ( cudaHostAlloc
) on the host side for your image storage. 此外,如果您在主机端使用固定分配( cudaHostAlloc
)进行映像存储,您可能会看到类似~2倍加速(对于cudaMemcpy
操作)。
Finally, for the duration of your loop, there's no need to copy the results back to the host. 最后,在循环期间,无需将结果复制回主机。 Do this after you're done computing the average. 完成计算平均值后执行此操作。 This will save 2 out of the 3 cudaMemcpy
operations you are doing in the wrapper. 这将节省您在包装器中执行的3个cudaMemcpy
操作中的2个。
While we're at it, in my opinion using memset
to initialize a float
array is questionable. 顺便说一下,在我看来,使用memset
来初始化一个float
数组是值得怀疑的。 It works for a zero value, but essentially no other. 它适用于零值,但基本上没有其他值。 Furthermore, I would expect passing 0.0
as the second parameter to memset
to at least throw a compiler warning. 此外,我认为将0.0
作为memset
的第二个参数传递,至少应该引发编译器警告。
The following code shows the above optimizations, and demonstrates about an 8x speedup over your code in my test case: 以下代码显示了上述优化,并演示了在我的测试用例中代码速度提高了8倍:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <sys/time.h>
/*
 * Baseline accumulation kernel: B[i] = A[i] / div + B[i] for every i < Size.
 *
 * A    - input samples, read on the device.
 * B    - float accumulators, updated in place on the device.
 * Size - number of elements to process in A and B.
 * div  - divisor applied to each sample before it is added.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. Threads
 * whose global index is >= Size do nothing, so the grid may overshoot Size.
 * No shared memory is used and no block-level barrier is required.
 */
__global__ void VecAdd(unsigned short *A, float *B, unsigned int Size, float div){
    // Form the index in size_t so blockIdx.x * blockDim.x cannot wrap around.
    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < Size) {
        B[idx] = (float)A[idx] / div + B[idx];
    }
}
/*
 * Multiply-accumulate variant: B[i] = A[i] * mult + B[i] for every i < Size.
 * The caller passes a precomputed factor instead of a divisor.
 *
 * A    - input samples, read on the device.
 * B    - float accumulators, updated in place on the device.
 * Size - number of elements to process in A and B.
 * mult - factor applied to each sample before it is added.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. Threads
 * whose global index is >= Size do nothing, so the grid may overshoot Size.
 * No shared memory is used and no block-level barrier is required.
 */
__global__ void VecAdd2(unsigned short *A, float *B, unsigned int Size, float mult){
    // Form the index in size_t so blockIdx.x * blockDim.x cannot wrap around.
    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < Size) {
        B[idx] = (float)A[idx] * mult + B[idx];
    }
}
/*
 * Baseline (unoptimized) accumulation step: adds one host image, divided by
 * `div`, into a host-side running sum by round-tripping both buffers through
 * the GPU.
 *
 * pixels1 - host array of `length` unsigned short samples (input).
 * pixels2 - host array of `length` floats; read as the current sum and
 *           overwritten with the updated sum on success.
 * length  - number of elements in both arrays.
 * div     - divisor passed through to the VecAdd kernel.
 *
 * Device buffers are deliberately allocated and released on every call; this
 * is the behavior the benchmark compares against. On any CUDA failure a
 * message is written to stderr, the remaining steps are skipped, and pixels2
 * is left as it was.
 */
void kernel_wrapper(unsigned short* pixels1, float* pixels2, unsigned int length, float div)
{
    if (length == 0) {
        return; // nothing to accumulate; a zero-block launch would be rejected
    }

    const size_t inputBytes = (size_t)length * sizeof(unsigned short);
    const size_t sumBytes   = (size_t)length * sizeof(float);

    // Start from NULL so the cudaFree calls below are safe on every path.
    unsigned short* deviceData1 = NULL;
    float* deviceData2 = NULL;

    cudaError_t err = cudaMalloc((void**)&deviceData1, inputBytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&deviceData2, sumBytes);
    if (err == cudaSuccess) err = cudaMemcpy(deviceData1, pixels1, inputBytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(deviceData2, pixels2, sumBytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        const unsigned int threads = 1024; // my maximum
        // Round up so a trailing partial block is still launched;
        // length = 1280*960 -> blocks = 1200.
        const unsigned int blocks = length / threads + (length % threads != 0 ? 1u : 0u);
        VecAdd<<< blocks, threads >>>( deviceData1, deviceData2, length, div );
        err = cudaGetLastError(); // launch-configuration errors show up here
    }

    if (err == cudaSuccess) err = cudaMemcpy(pixels2, deviceData2, sumBytes, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        fprintf(stderr, "kernel_wrapper: CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree( deviceData1 );
    cudaFree( deviceData2 );
}
/*
 * Optimized accumulation step: uploads one host image into a caller-owned
 * device buffer and launches VecAdd2 to fold it into a caller-owned device
 * accumulator. Nothing is allocated, freed, or copied back here.
 *
 * h_pixels1 - host array of `length` unsigned short samples (input).
 * d_pixels1 - device buffer of at least `length` unsigned shorts that
 *             receives the upload.
 * d_pixels2 - device accumulator of at least `length` floats, updated in
 *             place by the kernel.
 * length    - number of elements to process.
 * my_mult   - factor passed through to VecAdd2.
 *
 * No synchronization call follows the launch; the caller reads the
 * accumulator back later. Copy or launch failures are reported on stderr.
 */
void kernel_wrapper2(unsigned short* h_pixels1, unsigned short* d_pixels1, float* d_pixels2, unsigned int length, float my_mult)
{
    if (length == 0) {
        return; // nothing to accumulate; a zero-block launch would be rejected
    }

    cudaError_t err = cudaMemcpy(d_pixels1, h_pixels1,
                                 (size_t)length * sizeof(unsigned short),
                                 cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        const unsigned int threads = 1024; // my maximum
        // Round up so a trailing partial block is still launched;
        // length = 1280*960 -> blocks = 1200.
        const unsigned int blocks = length / threads + (length % threads != 0 ? 1u : 0u);
        VecAdd2<<< blocks, threads >>>( d_pixels1, d_pixels2, length, my_mult );
        err = cudaGetLastError(); // launch-configuration errors show up here
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "kernel_wrapper2: CUDA error: %s\n", cudaGetErrorString(err));
    }
}
/* Wall-clock microseconds elapsed between two gettimeofday() samples. */
static long elapsedMicros(const timeval *start, const timeval *stop)
{
    /* Subtract before scaling so the seconds term stays small. */
    return (long)(stop->tv_sec - start->tv_sec) * 1000000L
         + (long)(stop->tv_usec - start->tv_usec);
}

/*
 * Benchmark driver. Averages `count` copies of a synthetic 1280x960 image
 * twice -- once through kernel_wrapper (allocate/copy/free on every call) and
 * once through kernel_wrapper2 (pinned host input, device buffers allocated
 * once, result copied back once) -- prints both timings, and checks that the
 * two results agree within 0.0001.
 *
 * Returns 0 when the results match; 1 on a mismatch, a host allocation
 * failure, or a CUDA error detected here.
 */
int main(){
    const int count = 2000;
    const int width = 1280;
    const int height = 960;
    const int numPixels = width * height;
    const size_t imageBytes = (size_t)numPixels * sizeof(unsigned short);
    const size_t sumBytes = (size_t)numPixels * sizeof(float);

    timeval t1, t2;
    int status = 0;
    cudaError_t err = cudaSuccess;

    /* Everything starts out NULL so the single cleanup path below can
       release whatever was actually acquired. */
    unsigned short *h1_image = NULL;
    unsigned short *h2_image = NULL;
    float *avrg2f = NULL;
    float *avrg3f = NULL;
    unsigned short *d_image = NULL;
    float *d_result = NULL;

    do {
        h1_image = (unsigned short *)malloc(imageBytes);
        avrg2f = (float *)malloc(sumBytes);
        avrg3f = (float *)malloc(sumBytes);
        if (h1_image == NULL || avrg2f == NULL || avrg3f == NULL) {
            fprintf(stderr, "main: host allocation failed\n");
            status = 1;
            break;
        }

        for (int i = 0; i < numPixels; i++) {
            h1_image[i] = (i % 256);
            avrg2f[i] = 0.0f;
        }

        /* Run 1: baseline wrapper, pageable host memory. */
        gettimeofday(&t1, NULL);
        for (int k = 0; k < count; k++) {
            kernel_wrapper(h1_image, avrg2f, numPixels, (float)count);
        }
        gettimeofday(&t2, NULL);
        printf("time 1 = %ld us\n", elapsedMicros(&t1, &t2));

        /* Run 2 input lives in pinned host memory. */
        err = cudaHostAlloc((void **)&h2_image, imageBytes, cudaHostAllocDefault);
        if (err != cudaSuccess) break;

        for (int i = 0; i < numPixels; i++) {
            h2_image[i] = (i % 256);
            avrg3f[i] = 0.0f;
        }

        /* Run 2: the timed region includes the one-time device allocations
           and the accumulator upload/download. */
        gettimeofday(&t1, NULL);
        err = cudaMalloc((void **)&d_image, imageBytes);
        if (err != cudaSuccess) break;
        err = cudaMalloc((void **)&d_result, sumBytes);
        if (err != cudaSuccess) break;
        err = cudaMemcpy(d_result, avrg3f, sumBytes, cudaMemcpyHostToDevice);
        if (err != cudaSuccess) break;

        const float scale = 1.0f / (float)count;
        for (int k = 0; k < count; k++) {
            kernel_wrapper2(h2_image, d_image, d_result, numPixels, scale);
        }

        /* Blocking copy; an error raised while the queued kernels ran is
           returned here. */
        err = cudaMemcpy(avrg3f, d_result, sumBytes, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) break;
        gettimeofday(&t2, NULL);
        printf("time 2 = %ld us\n", elapsedMicros(&t1, &t2));

        for (int i = 0; i < numPixels; i++) {
            if (fabs(avrg2f[i] - avrg3f[i]) > 0.0001) {
                printf("mismatch at %d, 1 = %f, 2 = %f\n", i, avrg2f[i], avrg3f[i]);
                status = 1;
                break;
            }
        }
    } while (0);

    if (err != cudaSuccess) {
        fprintf(stderr, "main: CUDA error: %s\n", cudaGetErrorString(err));
        status = 1;
    }

    cudaFree(d_result);
    cudaFree(d_image);
    if (h2_image != NULL) {
        cudaFreeHost(h2_image);
    }
    free(avrg3f);
    free(avrg2f);
    free(h1_image);
    return status;
}
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.