簡體   English   中英

使用 std::vector 的 OpenCL 矩陣乘法

[英]OpenCL Matrix Multiplication Using std::vector

我正在嘗試使用 OpenCL 來執行矩陣乘法。雖然代碼本身似乎可以運行,但返回的結果要麼是垃圾值,要麼是零。據我所知,似乎內核沒有收到正確的值。我在這裡遺漏了一些對我來說不明顯的東西,它是什麼? 我認為問題出在我分配緩衝區的方式上,但我不確定。

#define CL_USE_DEPRECATED_OPENCL_2_0_APIS

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "./cl.hpp"

/// Parses a comma-separated list of float literals and appends them to `vect`.
///
/// Each comma-delimited token may carry a C-style `f` suffix (e.g. `1.5f`);
/// everything from the first `f` onward is discarded before the number is
/// parsed. Leading whitespace in a token is skipped by the stream extraction.
///
/// @param vect        Destination vector; parsed values are appended in order.
/// @param readStream  Source stream; consumed until it is exhausted.
///
/// Tokens that do not contain a parsable number (empty tokens, stray text)
/// are skipped instead of appending an indeterminate value, so callers that
/// need a fixed element count should check `vect.size()` afterwards.
void populate_vector(std::vector<float> &vect, std::stringstream &readStream) {

    std::string token;

    while (std::getline(readStream, token, ',')) {
        // Strip the literal suffix by hand: find() returns npos when there is
        // no 'f', in which case substr() keeps the whole token. Relying on
        // operator>> to stop at the 'f' is not portable across standard
        // library implementations.
        const std::string literal = token.substr(0, token.find('f'));

        std::istringstream elementStream(literal);
        float value = 0.0f;
        if (elementStream >> value) {
            vect.push_back(value);
        }
    }

}

int main()
{
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    if(platforms.empty()){
        throw std::runtime_error("No Platforms found, check OpenCL installation.");
    }

    cl::Platform platform = platforms[0];
    std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
    std::vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
    if(devices.empty()){
        throw std::runtime_error ("No Devices Found, check installation.");
    }
    cl::Device device = devices[0];

    // Create an execusion context
    cl::Context context(device);

    cl::CommandQueue queue(context,device);


    // Load the kernel sources, use global memory
    std::ifstream fs("mCrossProd.cl");
    if(!fs.is_open()){
        throw  std::runtime_error("Cannot open kernel source file.");
    }

    // Extract kernel code
    std::stringstream ss;
    ss << fs.rdbuf();
    auto code = ss.str();
    cl::Program::Sources sources;
    sources.push_back({code.c_str(), code.length()});
    fs.close();

    // Build the kernel
    cl::Program program(context, sources);
    if(program.build({device})!= CL_SUCCESS){
        std::cout << " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << "\n";
        exit(1);
    }

    // Output matrix dimensions
    int M = 4, N = 3, K = 6;
    int A_dims = M * K;
    int B_dims = N * K;
    int C_dims = M * N;

    // Create buffers for device
    cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(float)*A_dims);
    cl::Buffer buffer_B(context,CL_MEM_READ_WRITE,sizeof(float)*B_dims);
    cl::Buffer buffer_C(context,CL_MEM_READ_WRITE,sizeof(float)*C_dims);

    std::string s;
    std::vector<float> A;
    std::vector<float> B;
    std::vector<float> C;
    std::ifstream infile("matrices.txt");
    std::cout << "Opened file to read" << '\n';


    std::getline(infile,s);
    //std::cout << s;
    std::stringstream mss(s);
    populate_vector(A, mss);   
    std::copy(A.begin(), A.end(), std::ostream_iterator<float>(std::cout, ", "));
    std::cout << '\n';

    mss.str("");
    mss.clear();
    std::getline(infile,s);
    mss << s;
    populate_vector(B, mss);  
    std::copy(B.begin(), B.end(), std::ostream_iterator<float>(std::cout, ", "));
    std::cout << '\n';

    mss.str("");
    mss.clear();
    std::getline(infile,s);
    mss << s;
    populate_vector(C, mss);
    std::copy(C.begin(), C.end(), std::ostream_iterator<float>(std::cout, ", "));
    std::cout << '\n';




    //write arrays A and B to the device
    queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,A.size()*sizeof(float),&A);
    queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,B.size()*sizeof(float),&B);

    std::cout << A.size() * sizeof(float) << '\n';
    std::cout << B.size() * sizeof(float) << '\n';
    std::cout << C.size() * sizeof(float) << '\n';

    // Select kernel, pass arguments
    cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
    kernel.setArg(0, M);
    kernel.setArg(1, N);
    kernel.setArg(2, K);
    kernel.setArg(3, buffer_A);
    kernel.setArg(4, buffer_B);
    kernel.setArg(5, buffer_C);

    // Execute kernel
    if( queue.enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(M,N),cl::NDRange(1,1)) != CL_SUCCESS )
    {
        std::cout << "Failed to launch kernel" << std::endl;
        exit(1);
    }
    queue.finish();

    // read result C from the device to array C
    queue.enqueueReadBuffer(buffer_C,CL_TRUE,0,C.size(),&C[0]);
    std::cout << C.size() << std::endl;
    std::cout << C_dims << std::endl;
    std::cout << M << " " << N << std::endl;
    std::cout << "\nThe solution is" << std::endl;
    std::copy(C.begin(), C.end(), std::ostream_iterator<float>(std::cout, ", "));
    std::cout << '\n';

     for(int i = 0; i < M; i++) {
        for(int j = 0; j < N; j++) {
            std::cout << "C[" + std::to_string(i*N+j) + "] = ";
            std::cout << C[i*N+j] << " ";
        }
        std::cout << std::endl;
    }
}

測試內核

    // Computes one element of C = A * B per work-item.
    //   A is read as M x K row-major (A[i*K + k]),
    //   B is read as K x N row-major (B[N*k + j]),
    //   C is written as M x N row-major (C[i*N + j]).
    // Work-item (i, j) is taken from global ids 0 and 1. The dot product is
    // accumulated in a private variable and stored once, so the result does
    // not depend on whatever the C buffer held before the launch.
    __kernel void mCrossProd(const int M, const int N, const int K, __global float* A, __global float* B, __global float* C) {
        int const i = get_global_id(0);
        int const j = get_global_id(1);
        int const debug_elem_id = 3; // purely for debug purposes.

        // Guard against a global range larger than the matrix.
        if (i >= M || j >= N) {
            return;
        }

        float acc = 0.0f;
        for (int k = 0; k < K; k++) {
            float const prod = A[i*K+k] * B[N*k+j];
            acc += prod;
            if ((i*N+j) == debug_elem_id)
            {
                //printf("PROD, i = %d, j = %d, k = %d, N = %d\n", i,j,k,N);
                printf("PROD, %.2f\n", prod);
                printf("SUM: %.2f\n", acc);
            }
        }
        C[i*N+j] = acc;
    }

matrices.txt 的內容

1.5f, 1.0f, 2.0f, 2.0f, 4.0f, 1.0f, 4.0f, 2.0f, 1.0f, 1.0f, 0.0f, 0.0f, 3.0f, 2.0f, 5.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 2.0f, 1.0f, 1.0f
1.5f, 2.0f, 4.0f, 1.0f, 1.0f, 2.0f, 4.0f, 2.0f, 1.0f, 0.0f, 0.0f, 1.0f, 9.0f, 2.0f, 1.0f, 2.0f, 1.0f, 0.0f
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
    queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,A.size()*sizeof(float),&A);
    queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,B.size()*sizeof(float),&B);

&A 應該改為 A.data() 或 &A[0],我推薦第一個。&A 是 std::vector 物件本身的位址,而不是其元素資料的位址,因此傳給裝置的是錯誤的記憶體內容。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM