
CUDA 7.5 experimental __host__ __device__ lambdas

I played a bit with the experimental device lambdas that were introduced in CUDA 7.5 and promoted in this blog post by Mark Harris.

For the following example I removed a lot of code that is not needed to show my problem (my actual implementation looks a bit nicer...).

I tried to write a foreach function that operates on vectors either on the device (one thread per element) or on the host (serially), depending on a template parameter. With this foreach function I can easily implement BLAS functions. As an example, I use assigning a scalar to each component of a vector (the complete code is attached at the end):

template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
    auto assign = [=] __host__ __device__ ( size_t index ) { vector[index] = a; };
    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}

However, this code gives a compiler error because of the __host__ __device__ lambda:

The closure type for a lambda ("lambda ->void") cannot be used in the template argument type of a __global__ function template instantiation, unless the lambda is defined within a __device__ or __global__ function

I get the same error if I remove __device__ from the lambda expression, and no compile error if I remove __host__ (a __device__-only lambda), but in that case the host part is not executed...

If I instead define the lambda as either __host__ or __device__ separately, the code compiles and works as expected:

template<bool onDevice> void assignScalar2( size_t size, double* vector, double a )
{
    if( onDevice )
    {
        auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
        foreachDevice( size, assign );
    }
    else
    {
        auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
        foreachHost( size, assign );
    }
}

However, this introduces code duplication and actually defeats the whole point of using lambdas for this example.

Is there a way to accomplish what I want to do, or is this a bug in the experimental feature? Defining a __host__ __device__ lambda is actually explicitly mentioned in the first example of the programming guide. Even for that simpler example (which just returns a constant value from the lambda) I could not find a way to use the lambda expression on both host and device.
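For reference, a minimal sketch of that simpler case, modeled on the programming guide's example, looks roughly as follows (kernel_call is a hypothetical kernel template used only for illustration; with CUDA 7.5 and --expt-extended-lambda it triggers the same diagnostic as above):

template<typename F> __global__ void kernel_call( F f )
{
    // hypothetical kernel used only to illustrate the restriction
    printf( "f() = %d\n", f() );
}

void simplerExample()
{
    // even this constant-returning __host__ __device__ lambda cannot be used
    // to instantiate the __global__ template when it is defined in host code
    auto seven = [] __host__ __device__ () { return 7; };
    kernel_call<<<1,1>>>( seven );
}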

Here is the full code; compile it with the options -std=c++11 --expt-extended-lambda:

#include <iostream>
using namespace std;

template<typename Operation> void foreachHost( size_t size, Operation o )
{
    for( size_t i = 0; i < size; ++i )
    {
        o( i );
    }
}

template<typename Operation> __global__ void kernel_foreach( Operation o )
{
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    o( index );
}

template<typename Operation> void foreachDevice( size_t size, Operation o )
{
    size_t blocksize = 32;
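    // NOTE: assumes size is a multiple of blocksize (true for SIZE = 32 in main below); kernel_foreach has no bounds check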
    size_t gridsize = size/32;
    kernel_foreach<<<gridsize,blocksize>>>( o );
}

__global__ void printFirstElementOnDevice( double* vector )
{
    printf( "dVector[0] = %f\n", vector[0] );
}

void assignScalarHost( size_t size, double* vector, double a )
{
    auto assign = [=] ( size_t index ) { vector[index] = a; };
    foreachHost( size, assign );
}

void assignScalarDevice( size_t size, double* vector, double a )
{
    auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
    foreachDevice( size, assign );
}

// compile error:
template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
    auto assign = [=]  __host__ __device__ ( size_t index ) { vector[index] = a; };
    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}

// works:
template<bool onDevice> void assignScalar2( size_t size, double* vector, double a )
{
    if( onDevice )
    {
        auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
        foreachDevice( size, assign );
    }
    else
    {
        auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
        foreachHost( size, assign );
    }
}

int main()
{
    size_t SIZE = 32;

    double* hVector = new double[SIZE];
    double* dVector;
    cudaMalloc( &dVector, SIZE*sizeof(double) );

    // clear memory
    for( size_t i = 0; i < SIZE; ++i )
    {
        hVector[i] = 0;
    }
    cudaMemcpy( dVector, hVector, SIZE*sizeof(double), cudaMemcpyHostToDevice );

    assignScalarHost( SIZE, hVector, 1.0 );
    cout << "hVector[0] = " << hVector[0] << endl;

    assignScalarDevice( SIZE, dVector, 2.0 );
    printFirstElementOnDevice<<<1,1>>>( dVector );
    cudaDeviceSynchronize();

    assignScalar2<false>( SIZE, hVector, 3.0 );
    cout << "hVector[0] = " << hVector[0] << endl;

    assignScalar2<true>( SIZE, dVector, 4.0 );
    printFirstElementOnDevice<<<1,1>>>( dVector );
    cudaDeviceSynchronize();

//  assignScalar<false>( SIZE, hVector, 5.0 );
//  cout << "hVector[0] = " << hVector[0] << endl;
//
//  assignScalar<true>( SIZE, dVector, 6.0 );
//  printFirstElementOnDevice<<<1,1>>>( dVector );
//  cudaDeviceSynchronize();

    cudaError_t error = cudaGetLastError();
    if(error!=cudaSuccess)
    {
        cout << "ERROR: " << cudaGetErrorString(error);
    }
}

I used the production release of CUDA 7.5.

Update

I tried a third version of the assignScalar function:

template<bool onDevice> void assignScalar3( size_t size, double* vector, double a )
{
#ifdef __CUDA_ARCH__
#define LAMBDA_HOST_DEVICE __device__
#else
#define LAMBDA_HOST_DEVICE __host__
#endif

    auto assign = [=] LAMBDA_HOST_DEVICE ( size_t index ) { vector[index] = a; };
    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}

It compiles and runs without error, but the device version (assignScalar3<true>) is not executed. I had expected __CUDA_ARCH__ to always be undefined here (since the function is not __device__), but I checked explicitly that there is a compilation pass where it is defined: nvcc compiles the translation unit once for the host and once per device architecture, and __CUDA_ARCH__ is defined during the device passes even inside the bodies of host functions.

The task that I tried to accomplish with the examples provided in the question is not possible with CUDA 7.5, even though it is not explicitly excluded from the allowed cases of the experimental lambda support.

According to the blog post CUDA 8 Features Revealed, NVIDIA announced that the CUDA Toolkit 8.0 will support __host__ __device__ lambdas as an experimental feature.

I verified that my example works with the CUDA 8 Release Candidate (Cuda compilation tools, release 8.0, V8.0.26).

Here is the code that I finally used, compiled with nvcc -std=c++11 --expt-extended-lambda:

#include <iostream>
using namespace std;

template<typename Operation> __global__ void kernel_foreach( Operation o )
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    o( i );
}

template<bool onDevice, typename Operation> void foreach( size_t size, Operation o )
{
    if( onDevice )
    {
        size_t blocksize = 32;
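        // NOTE: as above, assumes size is a multiple of blocksize; kernel_foreach has no bounds check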
        size_t gridsize = size/32;
        kernel_foreach<<<gridsize,blocksize>>>( o );
    }
    else
    {
        for( size_t i = 0; i < size; ++i )
        {
            o( i );
        }
    }
}

__global__ void printFirstElementOnDevice( double* vector )
{
    printf( "dVector[0] = %f\n", vector[0] );
}

template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
    auto assign = [=]  __host__ __device__ ( size_t i ) { vector[i] = a; };
    foreach<onDevice>( size, assign );
}

int main()
{
    size_t SIZE = 32;

    double* hVector = new double[SIZE];
    double* dVector;
    cudaMalloc( &dVector, SIZE*sizeof(double) );

    // clear memory
    for( size_t i = 0; i < SIZE; ++i )
    {
        hVector[i] = 0;
    }
    cudaMemcpy( dVector, hVector, SIZE*sizeof(double), cudaMemcpyHostToDevice );

    assignScalar<false>( SIZE, hVector, 3.0 );
    cout << "hVector[0] = " << hVector[0] << endl;

    assignScalar<true>( SIZE, dVector, 4.0 );
    printFirstElementOnDevice<<<1,1>>>( dVector );
    cudaDeviceSynchronize();

    cudaError_t error = cudaGetLastError();
    if(error!=cudaSuccess)
    {
        cout << "ERROR: " << cudaGetErrorString(error);
    }
}
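With this foreach in place, other element-wise, BLAS-style operations follow the same pattern. As a hypothetical example (not part of the code above), scaling a vector could look like this:

template<bool onDevice> void scaleVector( size_t size, double* vector, double a )
{
    // multiply every component by a, on host or device depending on the
    // template parameter, reusing foreach from the code above
    auto scale = [=] __host__ __device__ ( size_t i ) { vector[i] *= a; };
    foreach<onDevice>( size, scale );
}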
