简体   繁体   English

将函数指针及其参数作为 thrust::tuple 传递给全局函数

[英]Passing a function pointer and its parameters as a thrust::tuple to a global function

I want to do the following: 我要执行以下操作:

#include <thrust/tuple.h>
#include <tuple>

// Intended host-side signature: deduce Args... simultaneously from the
// function pointer and from the thrust::tuple argument.
// NOTE(review): deduction fails at the call site below — classic Thrust's
// tuple is a fixed-arity template with null_type defaults, so
// thrust::tuple<Args...> cannot drive pack deduction the way
// std::tuple<Args...> can; confirm against the Thrust version in use.
template<typename... Args>
void someFunction(void (*fp)(Args...), thrust::tuple<Args...> params) {
}

// Example callee whose signature should bind Args... = {int}.
void otherFunction(int n) {
}

// Demonstrates the problem: even with the explicit <int>, substitution
// into the thrust::tuple<Args...> parameter is where the compiler
// reports the failure.
int main(int argc, char **argv) {
    //// template argument deduction/substitution failed ////
    someFunction<int>(&otherFunction, thrust::make_tuple(1));
    return 0;
}

What I have tried: 我试过的

  1. Removing one of the two parameter leads to a working solution of course. 当然,删除两个参数之一将导致可行的解决方案。
  2. It works when I make someFunction a static function in a struct with template parameter. 当我在带有模板参数的struct中使someFunction成为静态函数时,它可以工作。 But in the original code someFunction is a CUDA kernel, so I can't do that. 但是在原始代码中, someFunction是CUDA内核,所以我不能这样做。 Any further ideas? 还有其他想法吗?
  3. It works when I change thrust::tuple to std::tuple. 当我将 thrust::tuple 更改为 std::tuple 时,它起作用。 Is there a way to construct a thrust::tuple out of a std::tuple? 有没有办法从 std::tuple 构造 thrust::tuple?

EDIT: 编辑:

To make it clearer: someFunction and otherFunction are __global__ ! 更清楚地说:someFunction 和 otherFunction 都是 __global__ 函数!

#include <thrust/tuple.h>
#include <tuple>

// Kernel variant of someFunction: same deduction/substitution problem
// as the host version, now with a __global__ entry point.
template<typename... Args>
__global__ void someFunction(void (*fp)(Args...), thrust::tuple<Args...> params) {
}

// Child kernel to be invoked through a function pointer.
__global__ void otherFunction(int n) {
}
// NOTE(review): the kernel's address is stashed in __constant__ memory
// so the device-side address (rather than the host launch stub) can be
// read back with cudaMemcpyFromSymbol — confirm this is required for
// the intended dynamic-parallelism launch.
__constant__ void (*kfp)(int) = &otherFunction;

// Host driver: fetch the device-visible kernel address, then try to
// launch the variadic kernel (this is the call that fails to compile).
int testPassMain(int argc, char **argv) {
    void (*h_kfp)(int);
    // Copy the raw pointer value out of the __constant__ symbol.
    cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
    // Same deduction/substitution failure as in the host-only example.
    someFunction<int><<<1,1>>>(h_kfp, thrust::make_tuple(1));
    return 0;
}

I get a compiler error: template argument deduction/substitution failed in both examples. 我收到一个编译器错误:在两个示例中, template argument deduction/substitution failed

Passing a function pointer and its parameters as a thrust::tuple to a global function 将函数指针及其参数作为 thrust::tuple 传递给全局函数

Something like this should be workable: 这样的事情应该是可行的:

$ cat t1161.cu
#include <thrust/tuple.h>
#include <stdio.h>

// "someFunction": receives a device-function pointer and a two-element
// tuple, and applies the function to each tuple element in order.
// Deducing the whole tuple type as T sidesteps variadic templates.
template <typename T, typename T1>
__global__ void kernel(void (*fp)(T1), T params){ // "someFunction"

  T1 first  = thrust::get<0>(params);
  T1 second = thrust::get<1>(params);
  fp(first);
  fp(second);
}

// "otherFunction": device-side callee; just echoes its argument.
__device__ void df(int n){                        // "otherFunction"

  printf("parameter = %d\n", n);
}

// Device-memory copy of df's address: host code cannot take the
// address of a __device__ function directly, so the value is read back
// via cudaMemcpyFromSymbol before being passed to the kernel.
__device__ void (*ddf)(int) = df;

// Host driver: copy the device function pointer out, launch the kernel
// with a two-element tuple, and surface any CUDA errors (the original
// left every API call and the launch unchecked, so failures were
// silent).
int main(){

  void (*hdf)(int);
  thrust::tuple<int, int> my_tuple = thrust::make_tuple(1,2);
  // Fetch the device-side address of df; verify the copy so a garbage
  // pointer is never handed to the kernel.
  cudaError_t err = cudaMemcpyFromSymbol(&hdf, ddf, sizeof(void *));
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  kernel<<<1,1>>>(hdf, my_tuple);
  // Kernel launches do not return errors directly: pick up launch
  // errors immediately, then execution errors at the sync.
  err = cudaGetLastError();
  if (err == cudaSuccess) err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}


$ nvcc -o t1161 t1161.cu
$ cuda-memcheck ./t1161
========= CUDA-MEMCHECK
parameter = 1
parameter = 2
========= ERROR SUMMARY: 0 errors
$

A similar methodology should also be workable if you intend df to be a __global__ function, you will just need to account properly for the dynamic parallelism case. 如果您打算将df用作__global__函数,则类似的方法也应该可行,您只需要适当考虑动态并行情况即可。 Likewise, only a slight variation on above should allow you to pass the tuple directly to the child function (ie df , whether device function or kernel). 同样,上面的一些细微变化应允许您将元组直接传递给子函数(即df ,无论是设备函数还是内核)。 It's not clear to me why you need variadic template arguments if your parameters are nicely packaged up in a thrust tuple. 我不清楚,如果您的参数很好地打包在推力元组中,为什么需要可变参数模板参数。

EDIT: If you can pass your tuple to the child kernel (I don't see why you wouldn't be able to, since according to your updated example the tuple and the child kernel share the same variadic parameter pack), then you may still be able to avoid variadic templates using this approach: 编辑:如果您可以将元组传递给子内核(我不明白为什么您不能这样做,因为根据更新的示例,元组和子内核共享相同的可变参数包),那么您可以仍然可以使用这种方法来避免可变参数模板:

$ cat t1162.cu
#include <thrust/tuple.h>
#include <stdio.h>

// "someFunction": parent kernel that re-launches the supplied child
// kernel via dynamic parallelism, forwarding the whole tuple as a
// single argument (T is deduced as the tuple type, so no variadic
// pack is needed). Requires -rdc=true and sm_35+.
template<typename T>
__global__ void someFunction(void (*fp)(T), T params) {
  fp<<<1,1>>>(params);
  // NOTE(review): device-side cudaDeviceSynchronize() was deprecated in
  // CUDA 11.6 and removed from device code in CUDA 12 — on newer
  // toolkits this example needs reworking (e.g. tail launch).
  cudaDeviceSynchronize();
}

// Child kernel taking a one-element tuple.
__global__ void otherFunction(thrust::tuple<int> t) {
  printf("param 0 = %d\n", thrust::get<0>(t));
}

// Child kernel taking a two-element tuple — a different signature
// dispatched through the same mechanism.
__global__ void otherFunction2(thrust::tuple<float, float> t) {
  printf("param 1 = %f\n", thrust::get<1>(t));
}
// Device-visible addresses of the child kernels; the host reads them
// back with cudaMemcpyFromSymbol before passing them to someFunction.
__device__ void (*kfp)(thrust::tuple<int>) = &otherFunction;
__device__ void (*kfp2)(thrust::tuple<float, float>) = &otherFunction2;

// Host driver: dispatches both child kernels (int tuple, then float
// tuple) through the templated parent kernel, now with error checking
// on every CUDA call (the original ignored all return codes, so
// failures were silent).
int main(int argc, char **argv) {
    void (*h_kfp)(thrust::tuple<int>);
    void (*h_kfp2)(thrust::tuple<float, float>);
    cudaError_t err;
    // Dispatch the one-element int tuple.
    err = cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) { fprintf(stderr, "copy kfp: %s\n", cudaGetErrorString(err)); return 1; }
    someFunction<<<1,1>>>(h_kfp, thrust::make_tuple(1));
    // Launch errors surface via cudaGetLastError, execution errors at
    // the sync.
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) { fprintf(stderr, "int dispatch: %s\n", cudaGetErrorString(err)); return 1; }
    // Dispatch the two-element float tuple.
    err = cudaMemcpyFromSymbol(&h_kfp2, kfp2, sizeof(void *), 0, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) { fprintf(stderr, "copy kfp2: %s\n", cudaGetErrorString(err)); return 1; }
    someFunction<<<1,1>>>(h_kfp2, thrust::make_tuple(0.5f, 1.5f));
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) { fprintf(stderr, "float dispatch: %s\n", cudaGetErrorString(err)); return 1; }
    return 0;
}
$ nvcc -arch=sm_35 -rdc=true -o t1162 t1162.cu -lcudadevrt
$ CUDA_VISIBLE_DEVICES="1" cuda-memcheck ./t1162
========= CUDA-MEMCHECK
param 0 = 1
param 1 = 1.500000
========= ERROR SUMMARY: 0 errors
$

In terms of functionality (being able to dispatch multiple child kernels with varying parameter packs) I don't see any difference in capability, again assuming your parameters are nicely packaged in a tuple. 在功能方面(能够分派具有不同参数包的多个子内核),在功能上没有任何区别,再次假设您的参数很好地打包在一个元组中。

A quick and dirty solution is to cast the function pointer: 一种快速而肮脏的解决方案是强制转换函数指针:

#include <thrust/tuple.h>
#include <tuple>

// Quick-and-dirty workaround: accept the function pointer with an
// unspecified parameter list so only the tuple drives the pack
// deduction, then cast it back before the child launch.
// NOTE(review): the cast is only well-defined because the pointer was
// originally created from a function of type void(Args...); calling
// through a mismatched signature would be undefined behavior.
template<typename... Args>
__global__ void someFunction(void (*fp)(), thrust::tuple<Args...> params) {
    void (*kfp)(Args...) = (void (*)(Args...)) fp;
    // Only the first tuple element is forwarded to the child kernel.
    kfp<<<1,1>>>(thrust::get<0>(params));
}

// Child kernel; its address is stashed in __constant__ memory below so
// the host can read the device-side value with cudaMemcpyFromSymbol.
__global__ void otherFunction(int n) {
    printf("n = %d\n", n);
}
__constant__ void (*kfp)(int) = &otherFunction;

// Host driver for the cast-based workaround: fetch the stored kernel
// address as a signatureless pointer and launch with Args = {int}.
int testPassMain(int argc, char **argv) {
    void (*h_kfp)();
    cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
    // NOTE(review): the CUDA calls above/below are unchecked; failures
    // here would be silent.
    someFunction<int><<<1,1>>>(h_kfp, thrust::make_tuple(1));
    return 0;
}

I'm open to nicer solutions! 我愿意接受更好的解决方案!

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM