I wrote a small program that adds two small vectors on the GPU using the OpenCL library. The main code, vectorAdd.cc,
is as follows:
#include <iostream>
#include <CL/cl.hpp>
#include <cassert>
#include <fstream>
#include <time.h>
#include <cmath>
/**
 * Fills the first `size` elements of `data` with pseudo-random values.
 *
 * Each element is set to rand() / RAND_MAX, i.e. a float in [0, 1].
 * The generator is not seeded here, so the sequence depends on whatever
 * seed the caller has (or has not) installed.
 *
 * @param data  destination array; must hold at least `size` floats
 * @param size  number of elements to fill; values <= 0 leave `data` untouched
 */
void randomInit(float *data, int size)
{
    // The index is signed on purpose: comparing an unsigned index with a
    // negative `size` would convert `size` to a huge unsigned value and
    // overrun the buffer.
    for (int i = 0; i < size; ++i)
        data[i] = rand() / static_cast<float>(RAND_MAX);
}
int main()
{
//get all platforms (drivers)
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
assert(platforms.size() > 0);
cl::Platform myPlatform = platforms[0];
std::cout << "Using platform: "<<myPlatform.getInfo<CL_PLATFORM_NAME>()<<"\n";
//get default device of the default platform
std::vector<cl::Device> devices;
myPlatform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
assert(devices.size() > 0);
cl::Device myDevice = devices[0];
std::cout<< "Using device: "<<myDevice.getInfo<CL_DEVICE_NAME>()<<"\n";
std::ifstream vectorAddFile("vector_add_kernel.cl" );
std::string src(std::istreambuf_iterator<char>(vectorAddFile), (std::istreambuf_iterator<char>()));
cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1));
cl::Context context(myDevice);
cl::Program program(context, sources);
int szVec = 10;
float* A = new float[szVec];
float* B = new float[szVec];
randomInit(A,szVec);
randomInit(B,szVec);
float* C = new float[szVec];
std::fill_n(C, szVec, 0);
// create buffers on the device
cl::Buffer buffer_A = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), A);
cl::Buffer buffer_B = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), B);
cl::Buffer buffer_C = cl::Buffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), C);
//create queue to which we will push commands for the device.
cl::CommandQueue queue(context, myDevice);
//write arrays A and B to the device
//queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float) * szVec, A);
//queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float) * szVec, B);
auto err = program.build("cl.std.CL1.2");
// run the kernel
cl::Kernel kernel(program,"vector_add", &err);
kernel.setArg(0, buffer_A);
kernel.setArg(1, buffer_B);
kernel.setArg(2, buffer_C);
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(szVec), cl::NullRange);
queue.finish();
//read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(float) * szVec, C);
std::cout<<" result: \n";
for(int i = 0; i < szVec; i++)
{
std::cout << A[i] << " + " << B[i] << " = " << C[i] << std::endl;
}
std::cout << std::endl;
return 0;
}
The kernel code, vector_add_kernel.cl,
is as follows:
/* Element-wise vector addition: each work-item stores the sum of one
   pair of input elements into the output array. */
__kernel void vector_add(__global float *A, __global float *B, __global float *C)
{
    // Dimension 0 of the global ID selects the element this work-item handles
    int gid = get_global_id(0);
    // Load both operands, then store their sum
    float lhs = A[gid];
    float rhs = B[gid];
    C[gid] = lhs + rhs;
}
The result I got is:
Using platform: NVIDIA CUDA
Using device: Tesla K20m
result:
0.840188 + 0.477397 = 0
0.394383 + 0.628871 = 0
0.783099 + 0.364784 = 0
0.79844 + 0.513401 = 0
0.911647 + 0.95223 = 0
0.197551 + 0.916195 = 0
0.335223 + 0.635712 = 0
0.76823 + 0.717297 = 0
0.277775 + 0.141603 = 0
0.55397 + 0.606969 = 0
The problem, as you can see, is that the result is always whatever I initialized vector C
to, and I do not understand why. I also initialized vector C
with some other values, and again the result was just the initial values.
It's probably just a syntax error in the build options string.
auto err = program.build("cl.std.CL1.2");
should be
auto err = program.build("-cl-std=CL1.2");
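The documentation on clBuildProgram has more information about the supported options. Because the original string is not a valid option, the build fails; since the returned error code is never checked, the kernel is never actually run and C keeps its initial values.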
The problem stems from building the program with this command
auto err = program.build("cl.std.CL1.2");
and by replacing the command above with
auto err = program.build();
the problem was solved. However, I still do not know why this happened. Any ideas?
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint this content, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.