[英]Implementing in OpenCL
我一直在尝试编写一个程序,用来统计康托集(Cantor set)中具有给定分母的有理数的个数。我发现在我的计算机上处理分母在 3^14 到 3^15 之间的数需要 20 个小时甚至更久。由于这需要测试大量相互独立的值,我认为很适合用 OpenCL 在显卡上实现。但当我尝试实现之后,得到的性能却比我的 CPU 实现慢了几个数量级。下面是我尝试的代码。
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <functional>
#include <ctime>
#include <iostream>
#include <fstream>
#include <exception>
#include <cstdlib>
#include <vector>
#include <thread>
#include <cmath>
#include <string>
#include <algorithm>
#include <thread>
#include <cmath>
#include <sstream>
#define SUCCESS 0
#define FAILURE 1
#define EXPECTED_FAILURE 2
const int NUM_ELEMENTS = 32768;
// Writes one "number,count" line to stdout for every non-zero entry of
// `values`. Entry k belongs to the number `start + k`; exactly NUM_ELEMENTS
// entries are inspected and zero entries are skipped.
void printOutput(unsigned long long start, unsigned long long *values){
    const unsigned long long *const end = values + NUM_ELEMENTS;
    unsigned long long number = start;
    for (const unsigned long long *cur = values; cur != end; ++cur, ++number) {
        if (*cur == 0)
            continue;
        std::cout << number << ',' << *cur << std::endl;
    }
}
// Fills `dataList` with the NUM_ELEMENTS consecutive integers
// start, start + 1, ..., start + NUM_ELEMENTS - 1.
void newList(unsigned long long start, unsigned long long *dataList){
    unsigned long long *const end = dataList + NUM_ELEMENTS;
    unsigned long long value = start;
    for (unsigned long long *slot = dataList; slot != end; ++slot, ++value)
        *slot = value;
}
using namespace cl;
Kernel kernelA;
Context context;
CommandQueue queue;
Buffer inputBuffer;
Buffer outputBuffer;
int init() {
cl_int status = 0;
const char* buildOption ="-x clc++ ";
std::vector<Platform> platforms;
status = Platform::get(&platforms);
if (status != CL_SUCCESS){
std::cout<<"Error: Getting platforms!"<<std::endl;
return FAILURE;
}
std::vector<cl::Platform>::iterator iter;
for(iter = platforms.begin(); iter != platforms.end(); ++iter)
if(!strcmp((*iter).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc."))
break;
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*iter) (), 0};
bool gpuNotFound = false;
try{
context = cl::Context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &status);
}
catch(std::exception e){
gpuNotFound = true;
}
if(gpuNotFound){
std::cout<<"GPU not found, falling back to CPU!"<<std::endl;
context = cl::Context(CL_DEVICE_TYPE_CPU, cps, NULL, NULL, &status);
if (status != CL_SUCCESS){
std::cout<<"Error: Creating context!"<<std::endl;
return FAILURE;
}
}
Program program;
try{
std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
queue = CommandQueue(context, devices[0]);
std::ifstream sourceFile("Rationals.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
program = Program(context, source);
program.build(devices, buildOption);
kernelA = Kernel(program, "countRationals");
inputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
outputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
}catch(cl::Error e){
std::cout << e.what() << std::endl;
std::cout << "Build Status: " << program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(cl::Device::getDefault()) << std::endl;
std::cout << "Build Options:\t" << program.getBuildInfo<CL_PROGRAM_BUILD_OPTIONS>(cl::Device::getDefault()) << std::endl;
std::cout << "Build Log:\t " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(cl::Device::getDefault()) << std::endl;
return FAILURE;
}
return SUCCESS;
}
int execute(unsigned long long* inputList, unsigned long long* outputList) {
try{
queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), inputList);
kernelA.setArg(0, inputBuffer);
kernelA.setArg(1, outputBuffer);
NDRange global(NUM_ELEMENTS/2);
NDRange local(256);
queue.enqueueNDRangeKernel(kernelA, NullRange, global, local);
queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), outputList);
}catch(cl::Error e){
std::cout << "Line "<< __LINE__<<": Error in "<<e.what() <<std::endl;
return FAILURE;
}
return SUCCESS;
}
using namespace std;
int main(int argc, char* argv[]){
unsigned long long minNum, maxNum;
if (argc == 2){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[1]) + 1);
}
else if (argc == 3){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[2]));
}
else if (argc == 4){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[2]));
}
else return -1;
unsigned long long *q = nullptr, *result = nullptr, *old = nullptr, *newq = nullptr;
thread workThread, outThread, genThread;
q = new unsigned long long[NUM_ELEMENTS];
newList(minNum, q);
result = new unsigned long long[NUM_ELEMENTS];
newq = new unsigned long long[NUM_ELEMENTS];
init();
genThread = thread(newList, minNum+NUM_ELEMENTS, newq);
workThread = thread(execute, q, result);
workThread.join();
genThread.join();
for(unsigned long long i = minNum + NUM_ELEMENTS; i < maxNum + NUM_ELEMENTS; i += NUM_ELEMENTS){
old = result;
q = newq;
result = new unsigned long long[NUM_ELEMENTS];
newq = new unsigned long long[NUM_ELEMENTS];
genThread = thread(newList, i+NUM_ELEMENTS, newq);
workThread = thread(execute, q, result);
outThread = thread(printOutput, i-NUM_ELEMENTS, old);
workThread.join();
outThread.join();
genThread.join();
delete[] old;
delete[] q;
q = old = nullptr;
}
delete[] newq;
delete[] result;
return 0;
}
和内核代码
// Tests the base-3 expansion of p/q for the digit 1.
//
// Phase 1 removes the factors of 3 from q one at a time, producing one
// leading ternary digit per factor. A digit of 1 ends the test immediately
// and the answer is `p == q` for the reduced q.
// Phase 2 (q no longer divisible by 3) generates further digits by repeatedly
// tripling the remainder until it returns to its starting value; any digit
// equal to 1 yields false, otherwise the result is true.
bool testCantor(unsigned long p, unsigned long q){
    while (q % 3 == 0) {
        q /= 3;
        const unsigned long digit = p / q;
        if (digit == 1)
            return p == q;
        p -= digit * q;            // same value as p % q
    }

    const unsigned long origin = p;
    for (;;) {
        const unsigned long tripled = 3 * p;
        const unsigned long digit = tripled / q;
        if (digit == 1)
            return false;
        p = tripled - digit * q;   // same value as tripled % q
        if (p == origin)
            return true;
    }
}
// Returns 2 when gcd(a, b) == 1 and 0 otherwise.
// The gcd is computed with the Euclidean algorithm; the result is produced
// without a branch so callers can add it to an accumulator unconditionally.
int coprime(unsigned long a, unsigned long b){
    while (a != 0) {
        const unsigned long remainder = b % a;
        b = a;
        a = remainder;
    }
    // (b == 1) is 1 or 0; shifting left by one turns that into 2 or 0.
    return (b == 1) << 1;
}
// Result for a single value q: for every p in [1, q/3] that is not a multiple
// of 3 and for which testCantor(p, q) holds, adds coprime(i, q) for
// i = p, 3p, 9p, ... while i <= q/3.
// The sum is kept in a private variable so that global memory is not read
// and written on every inner-loop iteration.
unsigned long countForDenominator(unsigned long q){
    const unsigned long limit = q / 3;
    unsigned long count = 0;
    for (unsigned long p = 1; p <= limit; p++) {
        if (p % 3 != 0 && testCantor(p, q))
            for (unsigned long i = p; i <= limit; i *= 3)
                count += coprime(i, q);
    }
    return count;
}

// Each work-item handles two elements: its own index from the front of the
// array and the mirrored index from the back, pairing a small q with a large
// one.
//
// The mirrored index is derived from the launch size, so `input` and `output`
// must each hold 2 * get_global_size(0) elements. For the host's launch of
// 16384 work-items this is the same index as the former hard-coded
// 32767 - get_global_id(0).
__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output){
    const size_t front = get_global_id(0);
    const size_t back = 2 * get_global_size(0) - 1 - front;

    output[front] = countForDenominator(input[front]);
    output[back] = countForDenominator(input[back]);
}
我有更好的方法来实现这一点吗? 我对OpenCL很陌生(我不到24小时就开始使用它),因此可能犯了一些相当明显的错误。
编辑:我发现我只是产生2个线程。 我将其更改为产生32个线程,每个线程256个q。 现在,当我从13运行到14时,它崩溃了,我也不知道为什么。 它不会从10崩溃到11
EDIT2:我实现了大多数建议(只是没弄明白如何去掉 if(coprime(p,q))),现在它的运行速度更快了(n = 10 时与 CPU 版本相差不到一秒)。还有其他方法可以进一步加快速度吗?在同一任务上,它仅比我的处理器快 33%。
EDIT3:我设法用位运算技巧(bit twiddling)去掉了那个 if。不知道是否还有其他条件判断可以用同样的方法处理。性能仍然没有明显提升(有什么建议吗?)
int execute(unsigned long long* inputList, unsigned long long* outputList) {
try
{
...
}
catch(cl::Error e)
{
...
}
return SUCCESS;
上面这个 execute() 在每次调用时都会创建缓冲区。如果多次调用 execute(),就会产生缓冲区创建/回收的开销。另外,您的全局范围(global range)只是局部范围(local range)的两倍,这意味着只会用到 GPU 的两个计算单元。如果您的显卡有 20 个计算单元,全局范围至少应为 40 × 局部范围。仅 512 个元素不足以让 GPU(哪怕只是其中一半的核心)保持繁忙。此外,for(p = 1; p <= q / 3; p++) 循环在各个核心上的迭代次数并不相同:有的核心只需数到 10,有的却要数到 100,这会破坏核心之间的执行同步。您应该让内核的负载更加均衡。例如:
让第一个核心计算第一个和最后一个元素,第二个核心计算第二个和第 N-1 个元素,依此类推。这样所有核心的工作量大致相等,而不会有核心闲着等待后面的核心。
// Load-balanced variant: every work-item processes two elements, its own
// index and a partner index returned by findOtherIndex(), so that a cheap
// element is paired with an expensive one.
// findOtherIndex() is not defined in this snippet; it is expected to map a
// work-item id to the index of its partner element.
__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output)
{
    // computing first element (least workload among the array)
    int gid = get_global_id(0);
    unsigned long q = input[gid], p = 1;
    output[gid] = 0;
    for(p = 1; p <= q/3; p++) // counts to 10 ....
    {
        if(p % 3 != 0 && testCantor(p, q))
            for(unsigned long i = p; i <= q/3; i *= 3)
                if(coprime(i,q))
                    output[gid] += 2;
    }
    //+ computing (N-gid) element (heaviest workload among the array)
    int N_gid = findOtherIndex(get_global_id(0));
    unsigned long N_q = input[N_gid], N_p = 1;
    output[N_gid] = 0;
    for(N_p = 1; N_p <= N_q/3; N_p++) // counts to 100?
    {
        if(N_p % 3 != 0 && testCantor(N_p, N_q))
            // Must iterate with N_p / N_q. Using p and q from the first half
            // would start at p = q/3 + 1, so this loop would never execute
            // and output[N_gid] would stay 0.
            for(unsigned long i = N_p; i <= N_q/3; i *= 3)
                if(coprime(i,N_q))
                    output[N_gid] += 2;
    }
    //this way, adjacent cores will have "closer to equal" work.
}
因此,如果您有 4096 个元素,则第一个核心计算第 1 个和第 4096 个元素,第二个核心计算第 2 个和第 4095 个元素,依此类推。一开始可以取局部范围为 64、全局范围为 4096。如果使用了过多的 “if”,则应为每个 “if” 配一个执行等量虚拟工作的 “else”,以保持各核心之间的计算同步。或者,如果这些 “if” 很简单,也可以把其中一些去掉:
if(a>b)c+=d;
可以改写为
c+=d*bitTwiddle_and_absoluteValue(a,b); // does only computation, not branching is good for gpu.
implement bitTwiddle_and_absoluteValue(a,b) such that it returns zero when a<=b and 1 when a>b
编辑:
Making the global size a multiple of the number of GPU cores could give extra performance.
编辑:让我们优化
for(p = 1; p <= q/3; p++){
if(p % 3 != 0 && testCantor(p, q))
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
p % 3 != 0 表示 p % 3 只能是 1 或 2。
p%3 == 1满足p = 1,4,7,10,... =>我们的第一个循环
p%3 == 2满足p = 2,5,8,11,... =>我们的第二个循环
让它们串联起来:
// Replacement for the single loop that tested `p % 3 != 0` on every
// iteration: two passes with stride 3 visit exactly the values of p that
// are not multiples of 3.
// Pass 1: p = 1, 4, 7, ...
for(p = 1; p <= q/3; p+=3){ // p % 3 == 1 holds for every p visited here
if(testCantor(p, q)) // so the modulus test is no longer needed
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
// Pass 2: p = 2, 5, 8, ...
for(p = 2; p <= q/3; p+=3){ // p % 3 == 2 holds for every p visited here
if(testCantor(p, q)) // so the modulus test is no longer needed
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
// The loop below is shown only to illustrate the work that was removed;
// do NOT keep it in the kernel. It visits the multiples of 3, which the
// original `p % 3 != 0` test always rejected. (Taken literally it would
// also never terminate for p = 0 if testCantor(0, q) is true, because
// i *= 3 leaves i at 0.)
for(p = 0; p <= q/3; p+=3){ // p % 3 == 0: the original "if" never fired here
if(testCantor(p, q)) // so this whole loop is dropped
for(unsigned long i = p; i <= q/3; i *= 3)
output[gid] += coprime(i,q);
}
另外,总循环迭代次数减少了1/3,这应该会有所提高。
编辑:while 循环中有一个取模运算,而且没有利用 GPU 的浮点运算能力。
// Sketch: run the Euclidean loop of coprime() on floating-point values.
// First convert the integers a, b, c to floats.
// NOTE(review): a float or double cannot represent every 64-bit integer
// exactly, so this is only exact for sufficiently small values; confirm the
// value range before using it.
while (a != 0){ // with floats, use a tolerance here instead of comparing against exactly zero
c = a;
a = b % a; // to be emulated with floating-point arithmetic:
// example: 5 % 3 --> 5.0 / 3.0 gives 1.yyy, so the quotient is 1
// then subtract: 5.0 - floor(5.0 / 3.0) * 3.0
// which gives 2.0, i.e. 5 % 3
// this is a single expression with no branch
// i.e. b % a would become b - floor(b / a) * a (the author was not sure of this)
b = c;
}
// Finally convert the floats back to integers.
How can one emulate modulus with using only fp arithmetics without losing precision?
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.