I am trying to use OpenCl to preform a vector multiplication, while the code itself seems to work the result returned is either garbage or zeros. From what I can tell it appears that it either the kernel is not receiving the correct values, there is something non-obvious to me that I am missing here, what is it? I thought it was the way I was allocating the buffers but I am uncertain.
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <iostream>
#include <fstream>
#include <sstream>
#include "./cl.hpp"
void populate_vector(std::vector<float> &vect, std::stringstream &readStream) {
std::string x;
std::string fStripped;
float readFloat;
while(std::getline(readStream, x, ',')){
std::stringstream elementStream;
elementStream << x;
std::getline(elementStream, fStripped, 'f');
elementStream << fStripped;
elementStream >> readFloat;
vect.push_back(readFloat);
}
}
int main()
{
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if(platforms.empty()){
throw std::runtime_error("No Platforms found, check OpenCL installation.");
}
cl::Platform platform = platforms[0];
std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if(devices.empty()){
throw std::runtime_error ("No Devices Found, check installation.");
}
cl::Device device = devices[0];
// Create an execusion context
cl::Context context(device);
cl::CommandQueue queue(context,device);
// Load the kernel sources, use global memory
std::ifstream fs("mCrossProd.cl");
if(!fs.is_open()){
throw std::runtime_error("Cannot open kernel source file.");
}
// Extract kernel code
std::stringstream ss;
ss << fs.rdbuf();
auto code = ss.str();
cl::Program::Sources sources;
sources.push_back({code.c_str(), code.length()});
fs.close();
// Build the kernel
cl::Program program(context, sources);
if(program.build({device})!= CL_SUCCESS){
std::cout << " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << "\n";
exit(1);
}
// Output matrix dimensions
int M = 4, N = 3, K = 6;
int A_dims = M * K;
int B_dims = N * K;
int C_dims = M * N;
// Create buffers for device
cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(float)*A_dims);
cl::Buffer buffer_B(context,CL_MEM_READ_WRITE,sizeof(float)*B_dims);
cl::Buffer buffer_C(context,CL_MEM_READ_WRITE,sizeof(float)*C_dims);
std::string s;
std::vector<float> A;
std::vector<float> B;
std::vector<float> C;
std::ifstream infile("matrices.txt");
std::cout << "Opened file to read" << '\n';
std::getline(infile,s);
//std::cout << s;
std::stringstream mss(s);
populate_vector(A, mss);
std::copy(A.begin(), A.end(), std::ostream_iterator<float>(std::cout, ", "));
std::cout << '\n';
mss.str("");
mss.clear();
std::getline(infile,s);
mss << s;
populate_vector(B, mss);
std::copy(B.begin(), B.end(), std::ostream_iterator<float>(std::cout, ", "));
std::cout << '\n';
mss.str("");
mss.clear();
std::getline(infile,s);
mss << s;
populate_vector(C, mss);
std::copy(C.begin(), C.end(), std::ostream_iterator<float>(std::cout, ", "));
std::cout << '\n';
//write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,A.size()*sizeof(float),&A);
queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,B.size()*sizeof(float),&B);
std::cout << A.size() * sizeof(float) << '\n';
std::cout << B.size() * sizeof(float) << '\n';
std::cout << C.size() * sizeof(float) << '\n';
// Select kernel, pass arguments
cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
kernel.setArg(0, M);
kernel.setArg(1, N);
kernel.setArg(2, K);
kernel.setArg(3, buffer_A);
kernel.setArg(4, buffer_B);
kernel.setArg(5, buffer_C);
// Execute kernel
if( queue.enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(M,N),cl::NDRange(1,1)) != CL_SUCCESS )
{
std::cout << "Failed to launch kernel" << std::endl;
exit(1);
}
queue.finish();
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C,CL_TRUE,0,C.size(),&C[0]);
std::cout << C.size() << std::endl;
std::cout << C_dims << std::endl;
std::cout << M << " " << N << std::endl;
std::cout << "\nThe solution is" << std::endl;
std::copy(C.begin(), C.end(), std::ostream_iterator<float>(std::cout, ", "));
std::cout << '\n';
for(int i = 0; i < M; i++) {
for(int j = 0; j < N; j++) {
std::cout << "C[" + std::to_string(i*N+j) + "] = ";
std::cout << C[i*N+j] << " ";
}
std::cout << std::endl;
}
}
The test Kernel
__kernel void mCrossProd(const int M, const int N, const int K, __global float* A, __global float* B, __global float* C) {
int const i = get_global_id(0);
int const j = get_global_id(1);
int const debug_elem_id = 3; // purely for debug purposes.
for(int k = 0; k < K; k++){
C[i*N+j] += A[i*K+k] * B[N*k+j];
if((i*N+j)==debug_elem_id)
{
//printf("PROD, i = %d, j = %d, k = %d, N = %d\n", i,j,k,N);
printf("PROD, %.2f\n", A[i*K+k] * B[N*k+j]);
printf("SUM: %.2f\n", C[i*N+j]);
}
}
}
The contents of matrices.txt
1.5f, 1.0f, 2.0f, 2.0f, 4.0f, 1.0f, 4.0f, 2.0f, 1.0f, 1.0f, 0.0f, 0.0f, 3.0f, 2.0f, 5.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 2.0f, 1.0f, 1.0f
1.5f, 2.0f, 4.0f, 1.0f, 1.0f, 2.0f, 4.0f, 2.0f, 1.0f, 0.0f, 0.0f, 1.0f, 9.0f, 2.0f, 1.0f, 2.0f, 1.0f, 0.0f
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,A.size()*sizeof(float),&A);
queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,B.size()*sizeof(float),&B);
&A
should be either A.data()
or &A[0]
, I recommend the first one
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.