命名空間作為CUDA中的模板參數

Question

在C++中，無法將namespace作為某種參數（通過模板參數或實際函數參數）傳遞給類或函數。 CUDA同樣如此（至少據我所知）。 這個問題解釋了一些原因：為什么命名空間不能成為模板參數？

這是一個用例示例：

// First example experiment: a group of free functions that share a namespace.
// The `{ ... }` bodies are placeholders; the question does not show them.
namespace experiment1
{
    int repetitions() { return 2; }  // this experiment reports 2 repetitions
    void setup() { ... }
    void f() { ... }
    void teardown() { ... }
}

// Second example experiment: same function names and signatures as
// experiment1. The `{ ... }` bodies are placeholders; the question does not
// show them.
namespace experiment2
{
    int repetitions() { return 4; }  // this experiment reports 4 repetitions
    void setup() { ... }
    void f() { ... }
    void teardown() { ... }
}

// Beware, this is invalid C++ and invalid CUDA
// (a namespace cannot be a template parameter). The snippet only sketches the
// desired usage: a generic driver that receives a whole experiment as NS.
template<namespace NS>
void do_test()
{
    // Do something with NS::repetitions(), NS::setup(), ...
}

這在C++中無效的原因之一是：命名空間能做到的事情，用類同樣都能做到。 實際上，您可以將每個命名空間都變成一個類，然后將這些函數變成成員函數，再將該類作為模板參數傳遞給do_test函數，或者將其實例作為參數傳遞給同一函數（前一種情況下可以使用靜態函數，后一種情況下可以使用虛函數）。

我同意這一點。 但是，在CUDA的特定情況下，有些事情可以用命名空間做到，卻無法用類做到。 想象f是一個內核，即__global__函數，並且setup或另一個函數用於指定（例如）要為內核分配的共享內存的大小。 內核不能是類的成員（請參閱此問題的答案： CUDA內核可以是虛擬函數嗎？）。 但是，您可以將它與同一實驗相關的其他函數封裝在同一個namespace中。

考慮上面代碼中所示的情況： do_test是用於設置計時器，准備一些輸入，檢查輸出，測量時間並執行其他一些操作的函數。 每個實驗都是由幾個具有相同名稱和接口的功能組成的集合，其中一個是內核。 您希望do_test具有足夠的通用性以處理所有這些實驗。 而且您希望每個實驗的代碼都以某種形式的封裝（例如名稱空間，結構，類等）獨立存在。

這個問題可以解決嗎？

根據talonmies的要求（順便說一句，非常感謝您的評論），我將使這個問題更加具體。

我有幾個非常簡單的內核，它們執行類似的操作。 它們從一個大數組中加載值，對這些值執行模具（stencil）操作，然后將結果寫入輸出數組（與輸入數組不同）。 此處的模具操作是指線程idx對輸入值idx及其相鄰值（例如從idx-3到idx+3 ）執行的操作。 這些內核中最簡單的內核僅執行從輸入到輸出的復制：每個線程讀取input[idx]並寫入output[idx] 。 另一個示例是執行output[idx] = input[idx+1] - input[idx-1]的差分模具（difference stencil）。 （我省略了一些細節，但是您明白了。）

我想對這些內核進行基准測試，以便得出性能模型。 對於每個內核，我還需要一個能夠檢查結果的宿主函數。 在每種情況下，我還都具有另一個內核，該內核通過優化以略有不同的方式執行相同的操作，但從結果的角度來看是等效的。 最后，我有一個宿主函數，可打印內核名稱。 這是代碼摘要：

// Experiment "copy": per the description above, each thread reads input[idx]
// and writes it to output[idx].
namespace copy
{
    // Host function returning the experiment's printable name.
    std::string name() { return "copy"; }
    // Baseline kernel (definition not shown).
    __global__ void kernel(const float* input, float* output, int size);
    // Optimized variant; per the author it is equivalent in its results.
    __global__ void kernelOptimized(const float* input, float* output, int size);
    // Host function that checks the results (definition not shown).
    bool check(const float* input, const float* output);
}

// Experiment "difference": per the description above, computes
// output[idx] = input[idx+1] - input[idx-1].
namespace difference
{
    // Host function returning the experiment's printable name.
    std::string name() { return "difference"; }
    // Baseline kernel (definition not shown).
    __global__ void kernel(const float* input, float* output, int size);
    // Optimized variant; per the author it is equivalent in its results.
    __global__ void kernelOptimized(const float* input, float* output, int size);
    // Host function that checks the results (definition not shown).
    bool check(const float* input, const float* output);
}

我有一個函數do_test，我將其參數化以使其通用：

// Function types describing the interface every experiment exposes.
// They are used as parameter types of do_test, where a function type is
// adjusted to a pointer to that function.

// name(): takes no arguments and returns the experiment's printable name.
typedef std::string NameFunction();
// check(): host-side validation of an output array against its input.
typedef bool CheckFunction(const float* input, const float* output);
// kernel() / kernelOptimized(): the signature shared by all kernels.
typedef void KernelFunction(const float* input, float* output, int size);

// Generic benchmark driver for one experiment (body only outlined here).
//
// name    - the experiment's name() function
// kernel1 - the experiment's baseline kernel
// kernel2 - the experiment's optimized kernel
// check   - the experiment's host-side result check
//
// The parameters are declared with function types, which the language adjusts
// to pointers to functions.
void do_test(NameFunction name, KernelFunction kernel1, KernelFunction kernel2, CheckFunction check)
{
    // Set up input and output array
    // Set up CUDA events
    // Warm up kernels
    // Run kernels
    // Check results
    // Measure time
    // Do standard output
}

int main()
{
    do_test<copy::name, copy::kernel, copy::kernelOptimized, copy::check>()
    do_test<difference::name, difference::kernel, difference::kernelOptimized, difference::check>()
}

現在，這種方式當然已經相當不錯了。 但是，如果我引入了每個實驗都必須提供的另一個功能，則需要在調用do_test地方修改所有這些行。 我希望傳遞此名稱空間或包含這些功能的某種對象。

Answer 1

您可以將內核修改為普通的__device__函數，然后通過kernel_wrapper調用它們：

#include <iostream>
#include <stdio.h>


// Pointer to a function with the signature shared by all experiment kernels;
// used as the non-type template parameter of kernel_wrapper.
typedef void (*kernel_ptr)(const float* input, float* output, int size);

// Generic __global__ entry point. The function to run is chosen at compile
// time through the `kernel` template argument; the wrapper forwards its three
// arguments to it unchanged.
template <kernel_ptr kernel>
__global__ void kernel_wrapper(const float* input, float* output, int size)
{
    (*kernel)(input, output, size);
}

// Experiment "copy" packaged as a struct so it can be passed as a template
// type argument. In this demo both device functions only print a tag and the
// calling thread's threadIdx.x.
struct copy
{
    // Printable name of the experiment.
    std::string name()
    {
        return "copy";
    }

    // Baseline variant.
    __device__ static void kernel(const float* input, float* output, int size)
    {
        printf("copy: %d\n", threadIdx.x);
    }

    // Optimized variant.
    __device__ static void kernelOptimized(const float* input, float* output, int size)
    {
        printf("copy optimized: %d\n", threadIdx.x);
    }
};

// Experiment "difference" packaged as a struct so it can be passed as a
// template type argument. In this demo both device functions only print a tag
// and the calling thread's threadIdx.x.
struct difference
{
    // Printable name of the experiment.
    std::string name() { return "difference"; }

    // Baseline variant. (The parameter list previously read `i nt size`,
    // which does not compile.)
    __device__ static void kernel(const float* input, float* output, int size){ printf("difference: %d\n",threadIdx.x); }
    // Optimized variant.
    __device__ static void kernelOptimized(const float* input, float* output, int size){ printf("difference optimized: %d\n",threadIdx.x); }
};

template <typename Experiment>
void do_test()
{
    dim3 dimBlock( 4, 1 );
    dim3 dimGrid( 1, 1 );
    Experiment e;

    std::cout << "running experiment " << e.name() << std::endl;
    std::cout << "launching the normal kernel" << std::endl;
    kernel_wrapper<Experiment::kernel><<<dimGrid, dimBlock>>>(0,0,0);
    cudaDeviceSynchronize();
    std::cout << "launching the optimized kernel" << std::endl;
    kernel_wrapper<Experiment::kernelOptimized><<<dimGrid, dimBlock>>>(0,0,0);
    cudaDeviceSynchronize();
}


// Runs the test driver once per experiment type.
int main()
{
    do_test<copy>();
    do_test<difference>();
    return 0;
}

輸出：

running experiment copy
launching the normal kernel
copy: 0
copy: 1
copy: 2
copy: 3
launching the optimized kernel
copy optimized: 0
copy optimized: 1
copy optimized: 2
copy optimized: 3
running experiment difference
launching the normal kernel
difference: 0
difference: 1
difference: 2
difference: 3
launching the optimized kernel
difference optimized: 0
difference optimized: 1
difference optimized: 2
difference optimized: 3

另外，您可以結合使用CRTP和模板特化：

#include <iostream>
#include <stdio.h>


// Primary kernel template, declared but not defined here; each experiment
// supplies its own explicit specialization further down.
template <typename Experiment>
__global__ void f();

// CRTP base class: Derived is the concrete experiment. It must provide
// blocksize() and repetitions(), and a specialization f<Derived> must exist.
template <typename Derived>
struct experiment
{
    // Launches f<Derived> repetitions() times, each time with one block of
    // blocksize() threads, then waits for the device once after the loop.
    void run()
    {
        Derived* const self = static_cast<Derived*>(this);
        int blocksize = self->blocksize();
        int reps = self->repetitions();

        // The launch configuration is the same for every repetition.
        dim3 dimBlock( blocksize, 1 );
        dim3 dimGrid( 1, 1 );

        int launched = 0;
        while (launched < reps)
        {
            f<Derived><<<dimGrid, dimBlock>>>();
            ++launched;
        }
        cudaDeviceSynchronize();
    }
};

// Experiment 1: two launches of four threads each. The constructor and
// destructor print the set-up and shut-down messages.
struct experiment1 : experiment<experiment1>
{
    experiment1()
    {
        std::cout << "setting up experiment 1" << std::endl;
    }

    ~experiment1()
    {
        std::cout << "shutting down experiment 1" << std::endl;
    }

    // Number of kernel launches.
    int repetitions()
    {
        return 2;
    }

    // Threads per block.
    int blocksize()
    {
        return 4;
    }
};

// Kernel for experiment1: each thread prints its threadIdx.x.
template <>
__global__ void f<experiment1>()
{
    printf("experiment1: %d\n", threadIdx.x);
}


// Experiment 2: four launches of two threads each. The constructor and
// destructor print the set-up and shut-down messages.
struct experiment2 : experiment<experiment2>
{
    experiment2()
    {
        std::cout << "setting up experiment 2" << std::endl;
    }

    ~experiment2()
    {
        std::cout << "shutting down experiment 2" << std::endl;
    }

    // Number of kernel launches.
    int repetitions()
    {
        return 4;
    }

    // Threads per block.
    int blocksize()
    {
        return 2;
    }
};

// Kernel for experiment2: each thread prints its threadIdx.x.
template <>
__global__ void f<experiment2>()
{
    printf("experiment2: %d\n", threadIdx.x);
}

// Runs one experiment: default-constructs it, calls run(), and destroys it
// when the function returns.
template<typename Experiment>
void do_test()
{
    Experiment instance;
    instance.run();
}

#include <iostream>
#include <stdio.h>


// Primary kernel template, declared but not defined here; each experiment
// supplies its own explicit specialization further down.
template <typename Experiment>
__global__ void f();

// CRTP base class: Derived is the concrete experiment. It must provide
// blocksize() and repetitions(), and a specialization f<Derived> must exist.
template <typename Derived>
struct experiment
{
    // Launches f<Derived> repetitions() times, each time with one block of
    // blocksize() threads, then waits for the device once after the loop.
    void run()
    {
        Derived* const self = static_cast<Derived*>(this);
        int blocksize = self->blocksize();
        int reps = self->repetitions();

        // The launch configuration is the same for every repetition.
        dim3 dimBlock( blocksize, 1 );
        dim3 dimGrid( 1, 1 );

        int launched = 0;
        while (launched < reps)
        {
            f<Derived><<<dimGrid, dimBlock>>>();
            ++launched;
        }
        cudaDeviceSynchronize();
    }
};

// Experiment 1: two launches of four threads each. The constructor and
// destructor print the set-up and shut-down messages.
struct experiment1 : experiment<experiment1>
{
    experiment1()
    {
        std::cout << "setting up experiment 1" << std::endl;
    }

    ~experiment1()
    {
        std::cout << "shutting down experiment 1" << std::endl;
    }

    // Number of kernel launches.
    int repetitions()
    {
        return 2;
    }

    // Threads per block.
    int blocksize()
    {
        return 4;
    }
};

// Kernel for experiment1: each thread prints its threadIdx.x.
template <>
__global__ void f<experiment1>()
{
    printf("experiment1: %d\n", threadIdx.x);
}


// Experiment 2: four launches of two threads each. The constructor and
// destructor print the set-up and shut-down messages.
struct experiment2 : experiment<experiment2>
{
    experiment2()
    {
        std::cout << "setting up experiment 2" << std::endl;
    }

    ~experiment2()
    {
        std::cout << "shutting down experiment 2" << std::endl;
    }

    // Number of kernel launches.
    int repetitions()
    {
        return 4;
    }

    // Threads per block.
    int blocksize()
    {
        return 2;
    }
};

// Kernel for experiment2: each thread prints its threadIdx.x.
template <>
__global__ void f<experiment2>()
{
    printf("experiment2: %d\n", threadIdx.x);
}

// Runs one experiment: default-constructs it, calls run(), and destroys it
// when the function returns.
template<typename Experiment>
void do_test()
{
    Experiment instance;
    instance.run();
}

// Runs both experiments in sequence; each do_test call constructs the
// experiment, runs it, and destroys it before the next one starts.
int main()
{
    do_test<experiment1>();
    do_test<experiment2>();
    return 0;
}

輸出：

setting up experiment 1
experiment1: 0
experiment1: 1
experiment1: 2
experiment1: 3
experiment1: 0
experiment1: 1
experiment1: 2
experiment1: 3
shutting down experiment 1
setting up experiment 2
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
shutting down experiment 2

命名空間作為CUDA中的模板參數

問題描述

1 個解決方案

解決方案1
2 已采納 2015-08-10 11:36:54

命名空間作為CUDA中的模板參數

問題描述

1 個解決方案

解決方案1 2 已采納 2015-08-10 11:36:54

解決方案1
2 已采納 2015-08-10 11:36:54