openCL的约简算法实现

Question

I have the following "addition of 2^20 floats" implemented on OpenCL using two different kernels: One is supposed to treat values on scalar basis and the other will use float4 vectors. 我在OpenCL上使用以下两种不同的内核实现了以下“ 2 ^ 20浮点数加法”：一种应按标量处理值，另一种将使用float4向量。 While the code builds without any compilation error, the program doesn't run. 尽管代码生成时没有任何编译错误，但是程序不会运行。 I've tried the code on both NVIDIA and AMD platforms and in both cases it fails. 我已经在NVIDIA和AMD平台上尝试了该代码，但在两种情况下均失败了。 When I debug the code it gives the error of stack overflow. 当我调试代码时，它给出了堆栈溢出的错误。 This code is a sample code of OpenCL in Action book. 该代码是《行动手册》中OpenCL的示例代码。 Any ideas why the code doesn't run? 有什么想法为什么代码无法运行？ Here is the code: 这是代码：

#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction.cl"

#define ARRAY_SIZE 1048576
#define NUM_KERNELS 2

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {

    cl_platform_id platform;
    cl_device_id dev;
    int err;

    /* Identify a platform */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err < 0) {
        perror("Couldn't identify a platform");
        exit(1);
    }

    /* Access a device */
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
    if (err == CL_DEVICE_NOT_FOUND) {
        printf(" GPU is not first! Going on CPU :(");
        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
    }
    if (err < 0) {
        perror("Couldn't access any devices");
        exit(1);
    }

    return dev;
}

/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

    cl_program program;
    FILE *program_handle;
    char *program_buffer, *program_log;
    size_t program_size, log_size;
    int err;

    /* Read program file and place content into buffer */
    program_handle = fopen(filename, "r");
    if (program_handle == NULL) {
        perror("Couldn't find the program file");
        exit(1);
    }
    fseek(program_handle, 0, SEEK_END);
    program_size = ftell(program_handle);
    rewind(program_handle);
    program_buffer = (char*)malloc(program_size + 1);
    program_buffer[program_size] = '\0';
    fread(program_buffer, sizeof(char), program_size, program_handle);
    fclose(program_handle);

    /* Create program from file */
    program = clCreateProgramWithSource(ctx, 1,
        (const char**)&program_buffer, &program_size, &err);
    if (err < 0) {
        perror("Couldn't create the program");
        exit(1);
    }
    free(program_buffer);

    /* Build program */
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err < 0) {

        /* Find size of log and print to std output */
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            0, NULL, &log_size);
        program_log = (char*)malloc(log_size + 1);
        program_log[log_size] = '\0';
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            log_size + 1, program_log, NULL);
        printf("%s\n", program_log);
        free(program_log);
        exit(1);
    }

    return program;
}

int main() {

    /* OpenCL structures */
    cl_device_id device;
    cl_context context;
    cl_program program;
    cl_kernel kernel[NUM_KERNELS];
    cl_command_queue queue;
    cl_event prof_event;
    cl_int i, j, err;
    size_t local_size, global_size;
    char kernel_names[NUM_KERNELS][20] =
    { "reduction_scalar", "reduction_vector" };

    /* Data and buffers */
    float data[ARRAY_SIZE];
    float sum, actual_sum, *scalar_sum, *vector_sum;
    cl_mem data_buffer, scalar_sum_buffer, vector_sum_buffer;
    cl_int num_groups;
    cl_ulong time_start, time_end, total_time;

    /* Initialize data */
    for (i = 0; i<ARRAY_SIZE; i++) {
        data[i] = 1.0f*i;
    }

    /* Create device and determine local size */
    device = create_device();
    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(local_size), &local_size, NULL);
    if (err < 0) {
        perror("Couldn't obtain device information");
        exit(1);
    }

    /* Allocate and initialize output arrays */
    num_groups = ARRAY_SIZE / local_size;
    scalar_sum = (float*)malloc(num_groups * sizeof(float));
    vector_sum = (float*)malloc(num_groups / 4 * sizeof(float));
    for (i = 0; i<num_groups; i++) {
        scalar_sum[i] = 0.0f;
    }
    for (i = 0; i<num_groups / 4; i++) {
        vector_sum[i] = 0.0f;
    }

    /* Create a context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err < 0) {
        perror("Couldn't create a context");
        exit(1);
    }

    /* Build program */
    program = build_program(context, device, PROGRAM_FILE);

    /* Create data buffer */
    data_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
        CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
    scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
        CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), scalar_sum, &err);
    vector_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
        CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), vector_sum, &err);
    if (err < 0) {
        perror("Couldn't create a buffer");
        exit(1);
    };

    /* Create a command queue */
    queue = clCreateCommandQueue(context, device,
        CL_QUEUE_PROFILING_ENABLE, &err);
    if (err < 0) {
        perror("Couldn't create a command queue");
        exit(1);
    };

    for (i = 0; i<NUM_KERNELS; i++) {

        /* Create a kernel */
        kernel[i] = clCreateKernel(program, kernel_names[i], &err);
        if (err < 0) {
            perror("Couldn't create a kernel");
            exit(1);
        };

        /* Create kernel arguments */
        err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &data_buffer);
        if (i == 0) {
            global_size = ARRAY_SIZE;
            err |= clSetKernelArg(kernel[i], 1, local_size * sizeof(float), NULL);
            err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &scalar_sum_buffer);
        }
        else {
            global_size = ARRAY_SIZE / 4;
            err |= clSetKernelArg(kernel[i], 1, local_size * 4 * sizeof(float), NULL);
            err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &vector_sum_buffer);
        }
        if (err < 0) {
            perror("Couldn't create a kernel argument");
            exit(1);
        }

        /* Enqueue kernel */
        err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, &global_size,
            &local_size, 0, NULL, &prof_event);
        if (err < 0) {
            perror("Couldn't enqueue the kernel");
            exit(1);
        }

        /* Finish processing the queue and get profiling information */
        clFinish(queue);
        clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
            sizeof(time_start), &time_start, NULL);
        clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
            sizeof(time_end), &time_end, NULL);
        total_time = time_end - time_start;

        /* Read the result */
        if (i == 0) {
            err = clEnqueueReadBuffer(queue, scalar_sum_buffer, CL_TRUE, 0,
                num_groups * sizeof(float), scalar_sum, 0, NULL, NULL);
            if (err < 0) {
                perror("Couldn't read the buffer");
                exit(1);
            }
            sum = 0.0f;
            for (j = 0; j<num_groups; j++) {
                sum += scalar_sum[j];
            }
        }
        else {
            err = clEnqueueReadBuffer(queue, vector_sum_buffer, CL_TRUE, 0,
                num_groups / 4 * sizeof(float), vector_sum, 0, NULL, NULL);
            if (err < 0) {
                perror("Couldn't read the buffer");
                exit(1);
            }
            sum = 0.0f;
            for (j = 0; j<num_groups / 4; j++) {
                sum += vector_sum[j];
            }
        }

        /* Check result */
        printf("%s: ", kernel_names[i]);
        actual_sum = 1.0f * ARRAY_SIZE / 2 * (ARRAY_SIZE - 1);
        if (fabs(sum - actual_sum) > 0.01*fabs(sum))
            printf("Check failed.\n");
        else
            printf("Check passed.\n");
        printf("Total time = %lu\n\n", total_time);

        /* Deallocate event */
        clReleaseEvent(prof_event);
    }

    /* Deallocate resources */
    free(scalar_sum);
    free(vector_sum);
    for (i = 0; i<NUM_KERNELS; i++) {
        clReleaseKernel(kernel[i]);
    }
    clReleaseMemObject(scalar_sum_buffer);
    clReleaseMemObject(vector_sum_buffer);
    clReleaseMemObject(data_buffer);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);
    return 0;
}

and here is the kernels which are stored in "reduction.cl" file: 这是存储在“ reduction.cl”文件中的内核：

__kernel void reduction_scalar(__global float* data, 
      __local float* partial_sums, __global float* output) {

   int lid = get_local_id(0);
   int group_size = get_local_size(0);

   partial_sums[lid] = data[get_global_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   for(int i = group_size/2; i>0; i >>= 1) {
      if(lid < i) {
         partial_sums[lid] += partial_sums[lid + i];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
   }

   if(lid == 0) {
      output[get_group_id(0)] = partial_sums[0];
   }
}

__kernel void reduction_vector(__global float4* data, 
      __local float4* partial_sums, __global float* output) {

   int lid = get_local_id(0);
   int group_size = get_local_size(0);

   partial_sums[lid] = data[get_global_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   for(int i = group_size/2; i>0; i >>= 1) {
      if(lid < i) {
         partial_sums[lid] += partial_sums[lid + i];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
   }

   if(lid == 0) {
      output[get_group_id(0)] = dot(partial_sums[0], (float4)(1.0f));
   }
}

Answer 1

The size of data is too big. data量太大。 You need to dynamically allocate it by using malloc . 您需要使用malloc动态分配它。 Change of 的变化

float data[ARRAY_SIZE];

to 至

float *data = (float *)malloc(sizeof(float)* ARRAY_SIZE);

has solved the problem. 解决了问题。

openCL的约简算法实现

问题描述

1 个解决方案

解决方案1
1 已采纳 2015-12-04 14:55:46

openCL的约简算法实现

问题描述

1 个解决方案

解决方案1 1 已采纳 2015-12-04 14:55:46

解决方案1
1 已采纳 2015-12-04 14:55:46