CUDA Kernel Scheduler on GPU
I'm writing a CUDA kernel scheduler. The scheduler gets a vector of Task pointers and brings them to execution. The pointers point to KernelTask objects of different type parameters to support kernels with arbitrary parameters.

There is a CPU version of the scheduler and a GPU version. The CPU version works just fine: it calls the virtual function Task::start to execute a kernel. The GPU version has three problems:

1. Virtual functions are not allowed in CUDA. How can I avoid them without downcasting?
2. std::get is a host function. Is there a way to implement std::get myself for the GPU?
3. KernelTask objects are of different size, so I copy each of them separately with copyToGPU(). Is there a way to batch-copy them?

Here is the code:
// see http://stackoverflow.com/questions/7858817/unpacking-a-tuple-to-call-a-matching-function-pointer
template<int ...>
struct seq { };

template<int N, int ...S>
struct gens : gens<N-1, N-1, S...> { };

template<int ...S>
struct gens<0, S...> {
    typedef seq<S...> type;
};

class Task {
private:
    bool visited;
    bool reached;
protected:
    std::vector<std::shared_ptr<Task>> dependsOn;
    Task();
public:
    Task **d_dependsOn = NULL;
    int d_dependsOnSize;
    Task *d_self = NULL;
    int streamId;
    int id;
    cudaStream_t stream;

    virtual void copyToGPU() = 0;
    virtual void start() = 0;
    virtual void d_start() = 0;
    virtual ~Task() {}

    void init();
    void addDependency(std::shared_ptr<Task> t);
    cudaStream_t dfs();
};

template<typename... Args>
class KernelTask : public Task {
private:
    std::tuple<Args...> params;
    dim3 threads;
    dim3 blocks;
    void (*kfp)(Args...);

    template<int ...S>
    void callFunc(seq<S...>) {
        // inserting task into stream
        this->kfp<<<this->blocks, this->threads, 0, this->stream>>>(std::get<S>(params) ...);
        checkCudaErrors(cudaGetLastError());
        if (DEBUG) printf("Task %d: Inserting Task in Stream.\n", this->id);
    }

    template<int ...S>
    __device__ void d_callFunc(seq<S...>) {
        // inserting task into stream
        this->kfp<<<this->blocks, this->threads, 0, this->stream>>>(std::get<S>(params) ...);
        if (DEBUG) printf("Task %d: Inserting Task in Stream.\n", this->id);
    }

    KernelTask(int id, void (*kfp)(Args...), std::tuple<Args...> params, dim3 threads, dim3 blocks);

public:
    ~KernelTask();

    void copyToGPU();

    void start() override {
        callFunc(typename gens<sizeof...(Args)>::type());
    }

    __device__ void d_start() override {
        d_callFunc(typename gens<sizeof...(Args)>::type());
    }

    static std::shared_ptr<KernelTask<Args...>> create(int id, void (*kfp)(Args...), std::tuple<Args...> params, dim3 threads, dim3 blocks);
};

class Scheduler {
private:
    std::vector<std::shared_ptr<Task>> tasks;
public:
    Scheduler(std::vector<std::shared_ptr<Task>> &tasks) {
        this->tasks = tasks;
    }

    void runCPUScheduler();
    void runGPUScheduler();
};
EDIT:

(1) Virtual functions in CUDA: I get a Warp Illegal Address exception in scheduler in the following example:
struct Base {
    __host__ __device__ virtual void start() = 0;
    virtual ~Base() {}
};

struct Derived : Base {
    __host__ __device__ void start() override {
        printf("In start\n");
    }
};

__global__ void scheduler(Base *c) {
    c->start();
}

int main(int argc, char **argv) {
    Base *c = new Derived();
    Base *d_c;
    checkCudaErrors(cudaMalloc(&d_c, sizeof(Derived)));
    checkCudaErrors(cudaMemcpy(d_c, c, sizeof(Derived), cudaMemcpyHostToDevice));

    c->start();
    scheduler<<<1,1>>>(d_c);

    checkCudaErrors(cudaFree(d_c));
    return 0;
}
(2) thrust::tuple works fine.

(3) I'm open to suggestions.

(4) How do I pass a kernel function pointer to a kernel? I get a Warp Misaligned Address exception in the following example:
__global__ void baz(int a, int b) {
    printf("%d + %d = %d\n", a, b, a+b);
}

void schedulerHost(void (*kfp)(int, int)) {
    kfp<<<1,1>>>(1,2);
}

__global__ void schedulerDevice(void (*kfp)(int, int)) {
    kfp<<<1,1>>>(1,2);
}

int main(int argc, char **argv) {
    schedulerHost(&baz);
    schedulerDevice<<<1,1>>>(&baz);
    return 0;
}
"Virtual functions are not allowed in CUDA. How can I avoid them without down casting?" “CUDA中不允许使用虚拟功能。如果不进行低估,我怎么能避免它们呢?”
You can have both virtual __host__
and __device__
functions: http://docs.nvidia.com/cuda/cuda-c-programming-guide/#virtual-functions 您可以同时拥有虚拟
__host__
和__device__
函数: http : __host__
However: 然而:
It is not allowed to pass as an argument to a
__global__
function an object of a class with virtual functions.不允许将
__global__
函数作为参数传递给具有虚函数的类的对象。
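One common workaround, not spelled out in the answer above, is to construct the object in device code so that its virtual table pointer refers to device code. The following is a minimal sketch of that idea; the helper kernels makeDerived and destroyDerived are hypothetical names I chose for illustration:

#include <cstdio>

struct Base {
    __host__ __device__ virtual void start() = 0;
    __host__ __device__ virtual ~Base() {}
};

struct Derived : Base {
    __host__ __device__ void start() override { printf("In start\n"); }
};

// Construct the Derived object with device-side new so its vtable is valid on the GPU.
__global__ void makeDerived(Base **slot) {
    *slot = new Derived();
}

__global__ void scheduler(Base **slot) {
    (*slot)->start();   // virtual dispatch now happens entirely on the device
}

__global__ void destroyDerived(Base **slot) {
    delete *slot;
}

int main() {
    Base **d_slot;
    cudaMalloc(&d_slot, sizeof(Base *));
    makeDerived<<<1, 1>>>(d_slot);
    scheduler<<<1, 1>>>(d_slot);
    destroyDerived<<<1, 1>>>(d_slot);
    cudaDeviceSynchronize();
    cudaFree(d_slot);
    return 0;
}

The object itself never crosses the host/device boundary; only a pointer to device memory does, which sidesteps the restriction on passing objects with virtual functions to a __global__ function.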
"std::get is a host function. Is there a way to implement std::get myself for the GPU? " “std :: get是一个主机函数。有没有办法为GPU实现std :: get? ”
I'd suggest using thrust::tuple
instead which has both a __host__
and a __device__
implementation: http://thrust.github.io/doc/group__tuple.html 我建议使用
thrust::tuple
,它同时具有__host__
和__device__
实现: http : __host__
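As a minimal sketch of what that substitution could look like (my illustration, not code from the question or answer), the tuple-unpacking pattern from the question works unchanged with thrust::get, which is callable from both host and device code:

#include <thrust/tuple.h>
#include <cstdio>

template<int ...>
struct seq { };
template<int N, int ...S>
struct gens : gens<N-1, N-1, S...> { };
template<int ...S>
struct gens<0, S...> { typedef seq<S...> type; };

__global__ void baz(int a, int b) {
    printf("%d + %d = %d\n", a, b, a + b);
}

// Unpack the tuple with thrust::get (usable where std::get would be rejected
// in device-compiled code) and launch the kernel with the resulting arguments.
template<typename KernelPtr, typename Tuple, int ...S>
void callFunc(KernelPtr kfp, const Tuple &params, seq<S...>) {
    kfp<<<1, 1>>>(thrust::get<S>(params)...);
}

int main() {
    callFunc(&baz, thrust::make_tuple(1, 2), gens<2>::type());
    cudaDeviceSynchronize();
    return 0;
}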
Regarding function pointers:

The address of a __global__ function taken in host code cannot be used in device code (e.g. to launch the kernel).

http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#function-pointers
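A minimal sketch of one way around this (my addition, assuming dynamic parallelism is available, i.e. compute capability 3.5+, compiled with -rdc=true and linked against cudadevrt): take the kernel's address in device code, copy that value back to the host, and pass it to the device-side scheduler, so the pointer that is actually launched on the device was taken in device code. The helper getBazPtr is a hypothetical name for this sketch:

#include <cstdio>

typedef void (*kernel_ptr)(int, int);

__global__ void baz(int a, int b) {
    printf("%d + %d = %d\n", a, b, a + b);
}

// Takes the address of baz in device code, where it is valid for device-side launches.
__global__ void getBazPtr(kernel_ptr *out) {
    *out = baz;
}

__global__ void schedulerDevice(kernel_ptr kfp) {
    kfp<<<1, 1>>>(1, 2);   // device-side launch through the device-taken address
}

int main() {
    kernel_ptr *d_ptr, h_ptr;
    cudaMalloc(&d_ptr, sizeof(kernel_ptr));
    getBazPtr<<<1, 1>>>(d_ptr);
    cudaMemcpy(&h_ptr, d_ptr, sizeof(kernel_ptr), cudaMemcpyDeviceToHost);
    schedulerDevice<<<1, 1>>>(h_ptr);   // the pointer value is only dereferenced in device code
    cudaDeviceSynchronize();
    cudaFree(d_ptr);
    return 0;
}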