简体   繁体   English

OpenCL,C ++:简单求和浮点矢量程序的意外结果

[英]OpenCL, C++: Unexpected Results of simple sum float vector program

This is a simple program that reads two arrays of float4 vectors from files and then calculates the sum of the corresponding elements. 这是一个简单的程序,可以从文件中读取两个float4向量数组,然后计算对应元素的和。 The results were not what I expected! 结果与预期不符!

The main File: 主文件:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>


#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif



// Row count of the parsing arrays and number of float4 work-items processed (rows of files A and B).
constexpr int number_of_points{16};
// Column count of the parsing arrays that hold the contents of files A and B.
constexpr int number_of_axis{4};


using namespace std;

/**
 * Terminates the program when an OpenCL call did not succeed.
 *
 * @param err        Status code returned by the OpenCL call.
 * @param operation  Human-readable name of the operation, used in the message.
 *
 * Returns normally only when err equals CL_SUCCESS; otherwise prints the
 * operation name and the numeric code to stderr and exits with status 1.
 */
void checkError(cl_int err, const char *operation)
{
  if (err == CL_SUCCESS)
    return;

  fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
  exit(1);
}

int main(int argc, char *argv[]) {
    clock_t tStart = clock();
    // Create the two input vectors
    // working variables
    int i;
    ifstream input_fileA, input_fileB;  // input files
    string line;    // transfer row from file to array
    float x;        // transfer word from file to array
    int row = 0;    // number of rows of file A,B (= array)
    int col = 0;    // number of rows of file A,B (= array)

    // working arrays

    // working arrays
//  int mem_size_TempA = number_of_points * number_of_axis * sizeof(cl_float);
//  int mem_size_TempB = number_of_points * number_of_axis * sizeof(cl_float);

    float tempAArray[number_of_points][number_of_axis]={{0}};   // array contains file A data
    float tempBArray[number_of_points][number_of_axis]={{0}};   // array contains file B data



    int mem_size_InputA = number_of_points ;
    int mem_size_InputB = number_of_points ;
    int mem_size_Output = number_of_points ;

    float *inputAArray = (float*) malloc(number_of_points*sizeof(cl_float4));   // array contains file A data
    float *inputBArray = (float*) malloc(number_of_points*sizeof(cl_float4));   // array contains file B data
    float *outputArray = (float*) malloc(number_of_points*sizeof(cl_float4));   // array contains file B data


    // import input files
    input_fileA.open(argv[1]);
    input_fileB.open(argv[2]);


    // transfer input files data to array
    // input file A to arrayA
    row = 0;
    while (getline(input_fileA, line))
    {

        istringstream streamA(line);
        col = 0;
        while(streamA >> x){
            tempAArray[row][col] = x;
            col++;
        }
        row++;
    }

    // input file B to arrayB
    row = 0;
    while (getline(input_fileB, line))
    {

        istringstream streamB(line);
        col = 0;
        while(streamB >> x){
            tempBArray[row][col] = x;
            col++;
        }
        row++;
    }

    // switch columns of B array
    for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++ )
    {
        float temporary = tempBArray[row_of_arrayB][2];
        tempBArray[row_of_arrayB][2] = tempBArray[row_of_arrayB][1];
        tempBArray[row_of_arrayB][1] = temporary;
    }

    // from Array to 3d vectors
//    for (int row_of_array = 0; row_of_array<number_of_points; row_of_array++)
//    {
//      inputAArray[row_of_array] = (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2],0);
//      inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
//    }

    for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
    {

        inputAArray[row_of_array*4+0] = tempAArray[row_of_array][0];
        inputAArray[row_of_array*4+1] = tempAArray[row_of_array][1];
        inputAArray[row_of_array*4+2] = tempAArray[row_of_array][2];
        inputAArray[row_of_array*4+3] = 0.0f;

//      inputAArray[row_of_array]= float(4) (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2], 0.0f);

        inputBArray[row_of_array*4+0] = tempBArray[row_of_array][0];
        inputBArray[row_of_array*4+1] = tempBArray[row_of_array][1];
        inputBArray[row_of_array*4+2] = tempBArray[row_of_array][2];
        inputBArray[row_of_array*4+3] = 0.0f;

        outputArray[row_of_array*4+0] = 0.0f;
        outputArray[row_of_array*4+1] = 0.0f;
        outputArray[row_of_array*4+2] = 0.0f;
        outputArray[row_of_array*4+3] = 0.0f;
//      inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);

    }
//    for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
//    {
//      printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputAArray[row_of_array*number_of_points+0], inputAArray[row_of_array*number_of_points+1],
//              inputAArray[row_of_array*number_of_points+2], inputAArray[row_of_array*number_of_points+3]);
//    }
    // close input files
    input_fileA.close();
    input_fileB.close();




    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("calculate_bottom_SNM_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }

    fseek(fp, 0, SEEK_END);
    size_t programLength = ftell(fp);
    rewind(fp);

    source_str = (char*)malloc(programLength+1);
    source_size = fread( source_str, 1, programLength, fp);
    source_str[programLength] = '\0';
    fclose( fp );

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
            &device_id, &ret_num_devices);

    // Create an OpenCL context
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector
    cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            mem_size_InputA*sizeof(cl_float4) , NULL, &ret);
    cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            mem_size_InputB*sizeof(cl_float4), NULL, &ret);

    cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
            mem_size_Output*sizeof(cl_float4), NULL, &ret);


    // Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0,
            mem_size_InputA*sizeof(cl_float4), inputAArray, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0,
            mem_size_InputB*sizeof(cl_float4), inputBArray, 0, NULL, NULL);


    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
            (const char **)&source_str, (const size_t *)&source_size, &ret);

    // Build the program

    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret == CL_BUILD_PROGRAM_FAILURE)
      {
        // Get size of build log
        size_t logSize;
        ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                                    0, NULL, &logSize);
        checkError(ret, "getting build log size");

        // Get build log
        char log[logSize];
        ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                                    logSize, log, NULL);
        checkError(ret, "getting build log");

        printf("OpenCL program build log:\n%s\n", log);
        exit(1);
      }


    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);

    // Execute the OpenCL kernel on the list
    size_t global_item_size = number_of_points; // Process the entire lists
    size_t local_item_size = 4; // Process in groups of 64

    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
            &global_item_size, &local_item_size, 0, NULL, NULL);

    // Read the memory buffer C on the device to the local variable C
//    int *C = (int*)malloc(sizeof(int)*number_of_points);


//    float *C = (float*)malloc(sizeof(float)*number_of_points);
    ret = clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
            mem_size_Output, outputArray, 0, NULL, NULL);


    // Display the result to the screen
//    float buttomSNM = 0;
    for(i = 0; i < number_of_points; i++)
    {
            printf("%f + %f = %f, \n",inputAArray[i*4+0],inputBArray[i*4+0], outputArray[i*4+0]);
    }

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(inputa_mem_obj);
    ret = clReleaseMemObject(inputb_mem_obj);
    ret = clReleaseMemObject(output_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free (inputAArray);
    free (inputBArray);
    free (outputArray);

printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
    return 0;
}

Kernel: 核心:

/*
 * Element-wise sum of two float4 arrays.
 *
 * Each work-item handles the float4 at its global id in dimension 0 and
 * writes outputArray[id] = inputAArray[id] + inputBArray[id].
 */
__kernel void calculate_bottom_SNM(__global float4 *inputAArray, __global float4 *inputBArray,
                         __global float4 *outputArray) {

    // Index of the float4 this work-item is responsible for.
    const int idx = get_global_id(0);

    const float4 lhs = inputAArray[idx];
    const float4 rhs = inputBArray[idx];

    // Vector addition sums the x, y, z and w components in one operation.
    outputArray[idx] = lhs + rhs;

}

The First input file A: 第一个输入文件A:

0   0.000000e+00    9.998994e-01    
1   1.000000e-03    9.998981e-01    
2   2.000000e-03    9.998967e-01    
3   3.000000e-03    9.998953e-01    
4   4.000000e-03    9.998939e-01    
5   5.000000e-03    9.998925e-01    
6   6.000000e-03    9.998911e-01    
7   7.000000e-03    9.998896e-01    
8   8.000000e-03    9.998881e-01    
9   9.000000e-03    9.998865e-01    
10  1.000000e-02    9.998850e-01    
11  1.100000e-02    9.998834e-01    
12  1.200000e-02    9.998817e-01    
13  1.300000e-02    9.998800e-01    
14  1.400000e-02    9.998783e-01    
15  1.500000e-02    9.998766e-01

The Second input file B: 第二个输入文件B:

0   0.000000e+00    9.998966e-01    
1   1.000000e-03    9.998953e-01    
2   2.000000e-03    9.998939e-01    
3   3.000000e-03    9.998925e-01    
4   4.000000e-03    9.998911e-01    
5   5.000000e-03    9.998896e-01    
6   6.000000e-03    9.998881e-01    
7   7.000000e-03    9.998866e-01    
8   8.000000e-03    9.998850e-01    
9   9.000000e-03    9.998834e-01    
10  1.000000e-02    9.998818e-01    
11  1.100000e-02    9.998801e-01    
12  1.200000e-02    9.998785e-01    
13  1.300000e-02    9.998767e-01    
14  1.400000e-02    9.998750e-01    
15  1.500000e-02    9.998732e-01

The output should be the element-wise sum of the two files above. I printed only the first column, but the other columns behave the same way: 输出应该是上面两个文件逐元素相加的结果;我只打印了第一列,但其他列的表现相同:

The Output: 输出:

0.000000 + 0.000000 = 0.000000, 
1.000000 + 1.000000 = 0.000000, 
2.000000 + 2.000000 = 0.000000, 
3.000000 + 3.000000 = 0.000000, 
4.000000 + 4.000000 = 0.000000, 
5.000000 + 5.000000 = 0.000000, 
6.000000 + 6.000000 = 0.000000, 
7.000000 + 7.000000 = 0.000000, 
8.000000 + 8.000000 = 0.000000, 
9.000000 + 9.000000 = 0.000000, 
10.000000 + 10.000000 = 0.000000, 
11.000000 + 11.000000 = 0.000000, 
12.000000 + 12.000000 = 0.000000, 
13.000000 + 13.000000 = 0.000000, 
14.000000 + 14.000000 = 0.000000, 
15.000000 + 15.000000 = 0.000000, 
ALL Time taken: 0.07s

Thanks in advance, 提前致谢,

You are not copying the correct number of bytes back from the device to the host: 您没有将正确数量的字节从设备复制回主机:

int mem_size_Output = number_of_points ;

...

ret = clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
        mem_size_Output, outputArray, 0, NULL, NULL);

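The size argument of clEnqueueReadBuffer is in bytes, and the amount of data in your buffer is number_of_points * sizeof(cl_float4) . With mem_size_Output = 16, only 16 bytes (a single float4) are copied back, so the rest of outputArray keeps its initial zeros. clEnqueueReadBuffer的大小参数以字节为单位,缓冲区中的数据量为number_of_points * sizeof(cl_float4)。由于mem_size_Output = 16,只有16个字节(一个float4)被复制回来,因此outputArray的其余部分保持初始的零值。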

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM