[英]OpenCL kernel executes correctly only once
我写了一个内核,它只能正确执行一次! 我正在尝试执行以下内核代码:
/*
 * Per-work-group tree reduction (sum).
 *
 * Each work-item stages one element of g_idata into local memory, the
 * work-group then folds the staged values pairwise, and work-item 0 writes
 * the group's sum to g_odata[get_group_id(0)].
 *
 * g_idata - input values, read at index get_global_id(0).
 * sdata   - local scratch buffer; must hold at least get_local_size(0) ints.
 * g_odata - output, one partial sum per work-group.
 */
__kernel void calculate2dim(__global int *g_idata,
                            __local int *sdata, __global int *g_odata) {
    unsigned int tid = get_local_id(0);
    unsigned int i = get_global_id(0);
    unsigned int n = get_local_size(0);

    sdata[tid] = g_idata[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    for (unsigned int s = 1; s < n; s *= 2) {
        /*
         * The partner index must be bounds-checked: when the work-group size
         * is not a power of two, tid + s can run past the n staged elements
         * and would pull stale local memory into the sum.
         */
        if (tid % (2 * s) == 0 && tid + s < n) {
            sdata[tid] += sdata[tid + s];
        }
        /* Reached by every work-item, including the idle ones. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (tid == 0) {
        g_odata[get_group_id(0)] = sdata[0];
    }
}
这是主机代码的片段(请注意,这只是草稿,代码还不完善):
cl_int err;
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
int* data = new int[200];
int *res = new int[200];
int result = 0;
for(int i = 0; i < 200; i++) {
data[i] = i % 14;
res[i] = 0;
}
unsigned int correct; // number of correct results returned
cl_mem input1; // device memory used for the input array
cl_mem output;
unsigned int count = 200;
input1 = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * count, NULL, NULL);
if (!input1)
{
cerr << "Error: Failed to allocate device memory!" << endl;
exit(1);
}
err = clEnqueueWriteBuffer(commands, input1,
CL_TRUE, 0, sizeof(int) * count,
data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
cerr << "Error: Failed to write to source array!" << endl;
exit(1);
}
err = 0;
err = clSetKernelArg(kernel_av, 0, sizeof(cl_mem), &input1);
err |= clSetKernelArg(kernel_av, 1, count * sizeof(int), NULL);
err |= clSetKernelArg(kernel_av, 2, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
cerr << "Error: Failed to set kernel arguments! " << err << endl;
exit(1);
}
err = clGetKernelWorkGroupInfo(kernel_av, device_id,
CL_KERNEL_WORK_GROUP_SIZE,
sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
cerr << "Error: Failed to retrieve kernel work group info! "
<< err << endl;
exit(1);
}
global = count;
if(local > global) {
local = global;
}
cout << "global = " << global << " local = " << local << endl;
local = 200;
err = clEnqueueNDRangeKernel(commands, kernel_av,
1, NULL, &global, &local,
0, NULL, NULL);
if (err)
{
cerr << "Error: Failed to execute kernel!" << endl;
return EXIT_FAILURE;
}
// Wait for all commands to complete
clFinish(commands);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer(commands, output,
CL_TRUE, 0, sizeof(int) * count,
res, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
cerr << "Error: Failed to read output array! " << err << endl;
exit(1);
}
cout << res[0] << endl;
int x = 0;
for(int i = 0; i < 200; i++) {
x += data[i];
cout << res[i] << endl;
}
cout << x << endl;
// Shutdown and cleanup
delete [] data;
delete [] res;
clReleaseMemObject(input1);
clReleaseMemObject(output);
第一次执行后,主机代码得到了正确答案 1280;但之后每次执行(第二次及以后),得到的却是类似 1073743104 的错误结果。也许我忘了清理某些变量或其他东西?
谢谢!
终于我明白了!
使用此算法(并行归约树)时,请记住,每个工作组的线程数应为 2 的幂!
否则(我们取10个数字)将是这样的:
步骤0:1 2 3 4 5 6 7 8 9 10
步骤1: 3 2 7 4 11 6 15 8 19 10
步骤2:10(3 + 7) 2 7 4 26(11 + 15) 6 15 8 ??? 10 //最后一个活动线程(下标 8)要读取下标 10 的元素,但它已越界,读到的是未定义的值;因此每个步骤上的活动元素数都应该是偶数!
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.