简体   繁体   English

OpenCL内核只能正确执行一次

[英]OpenCL kernel executes correctly only once

I wrote a kernel and it executes correctly only once! 我写了一个内核,它只能正确执行一次! I am trying to execute this kernel code: 我正在尝试执行以下内核代码:

__kernel void calculate2dim(__global int *g_idata, 
__local int *sdata, __global int *g_odata) {

unsigned int tid = get_local_id(0);
unsigned int i = get_global_id(0);
sdata[tid] = g_idata[i];
barrier(CLK_LOCAL_MEM_FENCE);

for(unsigned int s = 1; s < get_local_size(0); s *= 2) {
    if(tid % (2*s) == 0) {
        sdata[tid] += sdata[tid + s];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}

if(tid == 0)
    g_odata[get_group_id(0)] = sdata[0];
}

This is the fragment of the host code (please, keep in mind, that it is a draft, not perfectly written code yet): 这是宿主代码的片段(请记住,这是草稿,尚未编写完美的代码):

cl_int err;
size_t global;  // global domain size for our calculation
size_t local;   // local domain size for our calculation

int* data = new int[200];
int *res = new int[200];
int result = 0;

for(int i = 0; i < 200; i++) {
    data[i] = i % 14;
    res[i] = 0;
}

unsigned int correct;               // number of correct results returned
cl_mem input1;                       // device memory used for the input array
cl_mem output;

unsigned int count = 200;

input1 = clCreateBuffer(context,  CL_MEM_READ_ONLY, sizeof(int) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * count, NULL, NULL);

if (!input1)
{
    cerr << "Error: Failed to allocate device memory!" << endl;
    exit(1);
}

err = clEnqueueWriteBuffer(commands, input1,
                           CL_TRUE, 0, sizeof(int) * count,
                           data, 0, NULL, NULL);

if (err != CL_SUCCESS)
{
    cerr << "Error: Failed to write to source array!" << endl;
    exit(1);
}

err = 0;
err  = clSetKernelArg(kernel_av, 0, sizeof(cl_mem), &input1);
err |= clSetKernelArg(kernel_av, 1, count * sizeof(int), NULL);
err |= clSetKernelArg(kernel_av, 2, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
    cerr << "Error: Failed to set kernel arguments! " << err << endl;
    exit(1);
}

err = clGetKernelWorkGroupInfo(kernel_av, device_id,
                               CL_KERNEL_WORK_GROUP_SIZE,
                               sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
    cerr << "Error: Failed to retrieve kernel work group info! "
         <<  err << endl;
    exit(1);
}

global = count;

if(local > global) {
    local = global;
}
cout << "global = " << global << " local = " << local << endl;
local = 200;

err = clEnqueueNDRangeKernel(commands, kernel_av,
                             1, NULL, &global, &local,
                             0, NULL, NULL);
if (err)
{
    cerr << "Error: Failed to execute kernel!" << endl;
    return EXIT_FAILURE;
}

// Wait for all commands to complete
clFinish(commands);

// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer(commands, output,
                           CL_TRUE, 0, sizeof(int) * count,
                           res, 0, NULL, NULL );

if (err != CL_SUCCESS)
{
    cerr << "Error: Failed to read output array! " <<  err << endl;
    exit(1);
}

cout << res[0] << endl;

int x = 0;
for(int i = 0; i < 200; i++) {
    x += data[i];
    cout << res[i] << endl;
}
cout << x << endl;

// Shutdown and cleanup
delete [] data;
delete [] res;

clReleaseMemObject(input1);
clReleaseMemObject(output);

After the first execution I have the correct answer 1280 in the host code, but then (second execution and some others) I get something like 1073743104 instead of the correct answer. 第一次执行后,我在主机代码中得到了正确的答案1280,但是随后(第二次执行等),我得到了类似于1073743104的信息,而不是正确的答案。 Maybe I forget to clean some variables or something? 也许我忘了清理一些变量或其他东西?

Thank you! 谢谢!

Finally I got it! 终于我明白了!

When you use this algorithm (Parallel Reduction Tree), you should keep in mind, that the number of threads should be the power of 2! 使用此算法(并行减少树)时,请记住,线程数应为2的幂!

Otherwise (let's take 10 numbers) it will be like this: 否则(我们取10个数字)将是这样的:

Step 0: 1 2 3 4 5 6 7 8 9 10 步骤0:1 2 3 4 5 6 7 8 9 10

Step 1: 3 2 7 4 11 6 15 8 19 10 步骤1: 3 2 7 4 11 6 15 8 19 10

Step 2: 10 (3 + 7) 2 7 4 18 (11 + 15) 6 15 8 ??? 步骤2:10(3 + 7) 2 7 4 18(11 + 15) 6 15 8 ??? 10 //because number of active threads on each step should be even! 10 //因为每个步骤上的活动线程数应该是偶数!

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM