在OpenCL中实施

Question

我一直在尝试编写一个程序，该程序用某些分母来计算Cantor集上的有理数。 我发现用计算机计算3 ^ 14到3 ^ 15之间的数字需要20个小时或更长时间。 我认为，由于这正在测试大量单独的值，因此在具有OpenCL的图形卡上实现将是一件好事。 当我尝试实现它时，我得到的性能比我的CPU实现要慢几个数量级。 这是我尝试的代码。

#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <functional>
#include <ctime>
#include <iostream>
#include <fstream>
#include <exception>
#include <cstdlib>
#include <vector>
#include <thread>
#include <cmath>
#include <string>
#include <algorithm>
#include <thread>
#include <cmath>
#include <sstream>

#define SUCCESS 0
#define FAILURE 1
#define EXPECTED_FAILURE 2

const int NUM_ELEMENTS = 32768;

// Prints one "denominator,count" line to stdout for every non-zero entry of
// `values`, where the denominator of entry i is `start + i`.
//
// start  - value associated with values[0]
// values - array of NUM_ELEMENTS results; zero entries are skipped
//
// Lines are terminated with '\n' and the stream is flushed once per call
// instead of once per line, so a batch costs a single flush while the output
// still becomes visible after every batch.
void printOutput(unsigned long long start, unsigned long long *values){
    for(int i = 0; i < NUM_ELEMENTS; i++)
        if (values[i] != 0)
            std::cout << start+i << ',' << values[i] << '\n';
    std::cout.flush();
}

// Fills dataList[0 .. NUM_ELEMENTS-1] with the consecutive values
// start, start+1, start+2, ...
void newList(unsigned long long start, unsigned long long *dataList){
    unsigned long long nextValue = start;
    unsigned long long *const stop = dataList + NUM_ELEMENTS;
    for(unsigned long long *slot = dataList; slot != stop; ++slot, ++nextValue)
        *slot = nextValue;
}

using namespace cl;

Kernel kernelA;
Context context;
CommandQueue queue;
Buffer inputBuffer;
Buffer outputBuffer;

int init() {
    cl_int status = 0;
    const char* buildOption ="-x clc++ ";
    std::vector<Platform> platforms;
    status = Platform::get(&platforms);
    if (status != CL_SUCCESS){
        std::cout<<"Error: Getting platforms!"<<std::endl;
        return FAILURE;
    }
    std::vector<cl::Platform>::iterator iter;
    for(iter = platforms.begin(); iter != platforms.end(); ++iter)
        if(!strcmp((*iter).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc."))
            break;
    cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*iter) (), 0};
    bool gpuNotFound = false;
    try{
        context = cl::Context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &status);
    }
    catch(std::exception e){
        gpuNotFound = true;
    }
    if(gpuNotFound){
        std::cout<<"GPU not found, falling back to CPU!"<<std::endl;
        context = cl::Context(CL_DEVICE_TYPE_CPU, cps, NULL, NULL, &status);
        if (status != CL_SUCCESS){
            std::cout<<"Error: Creating context!"<<std::endl;
            return FAILURE;
        }
    }
    Program program;
    try{
        std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
        queue = CommandQueue(context, devices[0]);
        std::ifstream sourceFile("Rationals.cl");
        std::string sourceCode(
            std::istreambuf_iterator<char>(sourceFile),
            (std::istreambuf_iterator<char>()));
        Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
        program = Program(context, source);
        program.build(devices, buildOption);
        kernelA = Kernel(program, "countRationals");
        inputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
        outputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
    }catch(cl::Error e){
        std::cout << e.what() << std::endl;
        std::cout << "Build Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(cl::Device::getDefault()) << std::endl;
        std::cout << "Build Options:\t" << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(cl::Device::getDefault()) << std::endl;
        std::cout << "Build Log:\t " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(cl::Device::getDefault()) << std::endl;
        return FAILURE;
    }
    return SUCCESS;
}

int execute(unsigned long long* inputList, unsigned long long* outputList) {
    try{
        queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), inputList);
        kernelA.setArg(0, inputBuffer);
        kernelA.setArg(1, outputBuffer);
        NDRange global(NUM_ELEMENTS/2);
        NDRange local(256);
        queue.enqueueNDRangeKernel(kernelA, NullRange, global, local);
        queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), outputList);
    }catch(cl::Error e){
        std::cout << "Line "<< __LINE__<<": Error in "<<e.what() <<std::endl;
        return FAILURE;
    }
    return SUCCESS;
}

using namespace std;

int main(int argc, char* argv[]){
    unsigned long long minNum, maxNum;
    if (argc == 2){
        minNum = pow(3, atoi(argv[1]));
        maxNum = pow(3, atoi(argv[1]) + 1);
    }
    else if (argc == 3){
        minNum = pow(3, atoi(argv[1]));
        maxNum = pow(3, atoi(argv[2]));
    }
    else if (argc == 4){
        minNum = pow(3, atoi(argv[1]));
        maxNum = pow(3, atoi(argv[2]));
    }
    else return -1;
    unsigned long long *q = nullptr, *result = nullptr, *old = nullptr, *newq = nullptr;
    thread workThread, outThread, genThread;
    q = new unsigned long long[NUM_ELEMENTS];
    newList(minNum, q);
    result = new unsigned long long[NUM_ELEMENTS];
    newq = new unsigned long long[NUM_ELEMENTS];
    init();
    genThread = thread(newList, minNum+NUM_ELEMENTS, newq);
    workThread = thread(execute, q, result);
    workThread.join();
    genThread.join();
    for(unsigned long long i = minNum + NUM_ELEMENTS; i < maxNum  + NUM_ELEMENTS; i += NUM_ELEMENTS){
        old = result;
        q = newq;
        result = new unsigned long long[NUM_ELEMENTS];
        newq = new unsigned long long[NUM_ELEMENTS];
        genThread = thread(newList, i+NUM_ELEMENTS, newq);
        workThread = thread(execute, q, result);
        outThread = thread(printOutput, i-NUM_ELEMENTS, old);
        workThread.join();
        outThread.join();
        genThread.join();
        delete[] old;
        delete[] q;
        q = old = nullptr;
    }
    delete[] newq;
    delete[] result;
    return 0;
}

和内核代码

// Digit-by-digit base-3 test of the fraction p/q (per its name, the Cantor-set
// membership test; callers pass 0 < p < q).
//
// Phase 1: while q is divisible by 3, divide it out and peel off one leading
//          base-3 digit of p/q. A digit of 1 is only accepted when the
//          remainder is exactly zero (p == q at that point).
// Phase 2: with the remaining q, keep generating digits via p -> 3p mod q
//          until p returns to its starting value; any digit equal to 1
//          rejects the fraction.
//
// Returns true when no rejecting digit was met.
bool testCantor(unsigned long p, unsigned long q){
    while(q % 3 == 0){
        q /= 3;
        const unsigned long digit = p / q;
        if (digit == 1)
            return p == q;
        p -= digit * q;              // same value as p % q
    }

    const unsigned long origin = p;
    for(;;){
        const unsigned long tripled = 3 * p;
        const unsigned long digit = tripled / q;
        if (digit == 1)
            return false;
        p = tripled - digit * q;     // same value as tripled % q
        if (p == origin)
            return true;
    }
}

// Runs Euclid's algorithm on (a, b) and returns 2 when the resulting gcd is 1,
// otherwise 0. The result is produced arithmetically, without a branch.
int coprime(unsigned long a, unsigned long b){
    while (a != 0){
        const unsigned long remainder = b % a;
        b = a;
        a = remainder;
    }
    // b now holds the gcd; (b == 1) is 1 or 0, shifted to 2 or 0.
    return ((int)(b == 1)) << 1;
}

// Computes the count for a single denominator q:
// for every p in [1, q/3] that is not a multiple of 3 and passes
// testCantor(p, q), adds coprime(i, q) for i = p, 3p, 9p, ... while i <= q/3.
//
// The sum is kept in a private variable and returned, so the caller writes
// global memory once instead of once per inner-loop iteration.
unsigned long countForDenominator(unsigned long q){
    const unsigned long limit = q / 3;
    unsigned long count = 0;
    for(unsigned long p = 1; p <= limit; p++){
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= limit; i *= 3)
                count += coprime(i, q);
    }
    return count;
}

// Each work-item processes two elements: index gid and index 32767 - gid.
//
// input  - denominators to evaluate
// output - output[k] receives countForDenominator(input[k])
//
// NOTE: 32767 is hard-coded and equals NUM_ELEMENTS - 1 of the host program
// shown with this kernel (launched with NUM_ELEMENTS/2 work-items); both must
// be changed together.
__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output){
    int gid = get_global_id(0);
    output[gid] = countForDenominator(input[gid]);

    gid = 32767 - get_global_id(0);
    output[gid] = countForDenominator(input[gid]);
}

我有更好的方法来实现这一点吗？ 我对OpenCL很陌生（我不到24小时就开始使用它），因此可能犯了一些相当明显的错误。

编辑：我发现我只启动了 2 个线程。我把它改成启动 32 个线程，每个线程处理 256 个 q。现在，从 13 运行到 14 时程序会崩溃，我不知道原因；而从 10 运行到 11 时不会崩溃。

EDIT2：我实现了大多数建议（但没能弄清楚如何去掉 if(coprime(p, q)) 这个分支），现在运行速度更快了（n = 10 时相差不到一秒）。还有其他办法可以进一步加快速度吗？在同一任务上，它只比我的处理器快 33％。

EDIT3：设法用位旋转实现它。 不知道是否还有其他条件可以做到。 仍然看不到很大的性能提升（有什么建议吗？）

Answer 1

int execute(unsigned long long* inputList, unsigned long long* outputList) {
    try
    {
       ...
    }
    catch(cl::Error e)
    {
       ...
    }
    return SUCCESS;

execute() 中正在创建缓冲区。如果您多次调用 execute()，就会产生缓冲区创建/回收的开销。另外，您的全局范围只是本地范围的两倍，这意味着只会用到 GPU 的两个计算单元。如果您的显卡有 20 个计算单元，则全局范围至少应为 40 × 本地范围。仅 512 个元素不足以让 GPU 保持繁忙，至少有一半的核心会闲置。此外，for(p = 1; p <= q/3; p++) 循环的迭代次数在各个核心上并不相同：有的只数到 10，有的要数到 100，这会破坏核心之间的同步执行。您应该让内核的负载更加均衡。例如：

馈入第一个核心以计算第一个和最后一个元素，第二个核心供以计算第二个和第N-1st个元素，因此所有核心执行几乎相等的工作，而不是闲置地等待后面的核心。

// Load-balanced variant: every work-item processes one element from the
// cheap end of the array and one from the expensive end, so neighbouring
// work-items do a similar total amount of work.
//
// input  - denominators to evaluate
// output - per-denominator counts, written at the same indices
//
// findOtherIndex() is not defined in this snippet; it is expected to map a
// work-item id to the index of its paired element (presumably N-1-gid).
__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output)
{
    // computing first element (least workload among the array)
    int gid = get_global_id(0);
    unsigned long q = input[gid], p = 1;
    output[gid] = 0;
    for(p = 1; p <= q/3; p++) // counts to 10 ....
    {
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= q/3; i *= 3)
                if(coprime(i,q))
                    output[gid] += 2;
    }

    //+ computing (N-gid) element (heaviest workload among the array)
    int N_gid = findOtherIndex(get_global_id(0));
    unsigned long N_q = input[N_gid], N_p = 1;
    output[N_gid] = 0;
    for(N_p = 1; N_p <= N_q/3; N_p++) // counts to 100? 
    {
        if(N_p % 3 != 0 && testCantor(N_p, N_q))
            // Must iterate with N_p / N_q: using p and q from the first half
            // (where p already exceeds q/3) would never enter this loop.
            for(unsigned long i = N_p; i <= N_q/3; i *= 3)
                if(coprime(i,N_q))
                    output[N_gid] += 2;
    }

     //this way, adjacent cores will have "closer to equal"  work. 

}

因此，如果您有4096个元素，则第一个内核将计算第1个和第4096个元素，第二个内核将计算第2个和第4095个元素，..开始时本地范围为64，全局范围为4096。 如果使用过多的“ if”，则应为每个变量放置一个“ else”以进行虚拟工作，以保持内核之间的计算顺序。 或者，如果它们很简单，则可以删除一些“如果”：

 if(a>b)c+=d;

可以改写为

 c+=d*bitTwiddle_and_absoluteValue(a,b); // does only computation, not branching is good for gpu.
 implement bitTwiddle_and_absoluteValue(a,b) such that it returns zero when a<=b and 1 when a>b

编辑：

 Making the global size a multiple of the number of GPU cores could give extra performance.

编辑：让我们优化

 for(p = 1; p <= q/3; p++){
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= q/3; i *= 3)
                    output[gid] +=  coprime(i,q);
    }

p％3！= 0表示仅1或2。

p％3 == 1满足p = 1,4,7,10，... =>我们的第一个循环

p％3 == 2满足p = 2,5,8,11，... =>我们的第二个循环

让它们串联起来：

 // The single loop over p with the "p % 3 != 0" filter is split into two
 // stride-3 loops, one per admissible residue, so multiples of 3 are never
 // visited and the modulus test disappears.
 for(p = 1; p <= q/3; p+=3){ // p = 1, 4, 7, ... : p % 3 == 1 always holds
        if(testCantor(p, q)) // so the p % 3 test is no longer needed
            for(unsigned long i = p; i <= q/3; i *= 3)
                    output[gid] +=  coprime(i,q);
    }

 for(p = 2; p <= q/3; p+=3){ // p = 2, 5, 8, ... : p % 3 == 2 always holds
        if(testCantor(p, q)) // so the p % 3 test is no longer needed
            for(unsigned long i = p; i <= q/3; i *= 3)
                    output[gid] +=  coprime(i,q);
    }

//so we got rid of this part:
for(p = 0; p <= q/3; p+=3){      // p%3==0 is not engaging "if" so we dont need
            if(testCantor(p, q))                     // this loop anymore lol :D
                for(unsigned long i = p; i <= q/3; i *= 3)
                        output[gid] +=  coprime(i,q);
        }

另外，总循环迭代次数减少了1/3，这应该会有所提高。

编辑：while 循环中有取模运算，并且没有利用 GPU 的浮点运算能力。

//here convert integers to floats a,b,c
while (a != 0){ // this will need a tolerance range, exact zero is nearly impossible
        c = a;
        a = b % a; //emulate this using fp
         // example: 5%3 --> 5.0 / 3.0 gives 1.yyy so we have 1 at least
         // then we subtract like: 5.0 - floor(5.0/3.0)*3.0
         // we have 2.0 which is 5%3
         // this is just a single condition
         // looks like b%a can be b-floor(b/a)*a but Im not sure
         // good luck!
        b = c;
    }
// here convert floats back to integers again

How can one emulate modulus with using only fp arithmetics without losing precision?

在OpenCL中实施

问题描述

1 个解决方案

解决方案1
1 已采纳 2014-02-12 06:41:54

在OpenCL中实施

问题描述

1 个解决方案

解决方案1 1 已采纳 2014-02-12 06:41:54

解决方案1
1 已采纳 2014-02-12 06:41:54