[英]How to do a reduction over one dimension of 2D data in Thrust

我是 CUDA 和 Thrust 库的新手。我正在学习并尝试实现一个函数,其中用一个 for 循环反复调用 Thrust 函数。有没有办法把这个循环改写成另一个 Thrust 函数调用?还是说我应该自己写 CUDA kernel 来实现?


// Thrust functor (unary predicate): returns true when an element is strictly
// greater than the threshold given at construction.
struct GreaterthanX
{
    const float _x; // threshold that elements are compared against

    GreaterthanX(float x) : _x(x) {}

    // Marked __host__ __device__, so it is callable from host and device code.
    __host__ __device__ bool operator()(const float &a) const
    {
        return a > _x;
    }
};

int main(void)
    // fill a device_vector with
    // 3 2 4 5
    // 0 -2 3 1
    // 9 8 7 6
    int row = 3;
    int col = 4;
    thrust::device_vector<int> vec(row * col);
    thrust::device_vector<int> count(row);
    vec[0] = 3;
    vec[1] = 2;
    vec[2] = 4;
    vec[3] = 5;
    vec[4] = 0;
    vec[5] = -2;
    vec[6] = 3;
    vec[7] = 1;
    vec[8] = 9;
    vec[9] = 8;
    vec[10] = 7;
    vec[11] = 6;

    // Goal: For each row, count the number of elements greater than 2. 
    // And then find the row with the max count

    // count the element greater than 2 in vec
    for (int i = 0; i < row; i++)
        count[i] = thrust::count_if(vec.begin(), vec.begin() + i * col, GreaterthanX(2));

    thrust::device_vector<int>::iterator result = thrust::max_element(count.begin(), count.end());
    int max_val = *result;
    unsigned int position = result - count.begin();

    printf("result = %d at position %d\r\n", max_val, position);
    // result = 4 at position 2

    return 0;

我的目标是找出"大于 2 的元素个数"最多的那一行。我正在苦苦思索如何在不使用循环的情况下做到这一点。任何建议都将不胜感激。谢谢。



为了优雅和可读性,我还擅自加入了一些 const、auto 和 lambda。由于使用了 lambda,编译时需要给 nvcc 加上 -extended-lambda 标志。

thrust::distance 是计算两个 Thrust 迭代器之间距离(即迭代器相减)的规范方法。

#include <cassert>
#include <cstdio>

#include <thrust/device_vector.h>
#include <thrust/distance.h>
#include <thrust/extrema.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>

int main(void)
    // fill a device_vector with
    // 3 2 4 5
    // 0 -2 3 1
    // 9 8 7 6
    int const row = 3;
    int const col = 4;
    thrust::device_vector<int> vec(row * col);
    vec[0] = 3;
    vec[1] = 2;
    vec[2] = 4;
    vec[3] = 5;
    vec[4] = 0;
    vec[5] = -2;
    vec[6] = 3;
    vec[7] = 1;
    vec[8] = 9;
    vec[9] = 8;
    vec[10] = 7;
    vec[11] = 6;
    thrust::device_vector<int> count(row);

    // Goal: For each row, count the number of elements greater than 2. 
    // And then find the row with the max count

    // count the element greater than 2 in vec

    // counting iterator avoids read from global memory, gives index into vec
    auto keys_in_begin = thrust::make_counting_iterator(0);
    auto keys_in_end = thrust::make_counting_iterator(row * col);
    // transform vec on the fly
    auto vals_in_begin = thrust::make_transform_iterator(
        [] __device__ (int val) { return val > 2 ? 1 : 0; });
    // discard to avoid write to global memory
    auto keys_out_begin = thrust::make_discard_iterator();
    auto vals_out_begin = count.begin();
    // transform keys (indices) into row indices and then compare
    // the divisions are one reason one might rather
    // use MatX for higher dimensional data
    auto binary_predicate = [col] __device__ (int i, int j){
        return i / col == j / col;
    // this function returns a new end for count 
    // b/c the final number of elements is often not known beforehand
    auto new_ends = thrust::reduce_by_key(keys_in_begin, keys_in_end,
    // make sure that we didn't provide too small of an output vector
    assert(thrust::get<1>(new_ends) == count.end());

    auto const result = thrust::max_element(count.begin(), count.end());
    int const max_val = *result;
    auto const position = thrust::distance(count.begin(), result);

    std::printf("result = %d at position %d\r\n", max_val, position);
    // result = 4 at position 2

    return 0;

附加方案:使用 MatX

正如评论中提到的,NVIDIA 发布了一个名为 MatX 的新的高级 C++17 库,它面向涉及(密集)多维数据(即张量)的问题。该库试图用一个类似 Python/MATLAB 的接口统一多个底层库,如 cuFFT、cuSOLVER 和 CUTLASS。在撰写本文时(v0.2.2),该库仍处于初始开发阶段,因此很可能不保证 API 稳定。出于这一点,再加上其性能尚未像更成熟的 Thrust 库那样经过充分优化、文档和示例也还不够详尽,MatX 目前还不应该用于生产代码。在构建这个解决方案的过程中,我实际上碰到了一个 bug,不过它马上就被修复了。因此这段代码只能在主分支上工作,不适用于当前发布的 v0.2.2,而且其中用到的一些功能可能尚未出现在文档中。

使用 MatX 的解决方案如下所示:

#include <iostream>
#include <matx.h>

int main(void)
    int const row = 3;
    int const col = 4;
    auto tensor = matx::make_tensor<int, 2>({row, col});
    tensor.SetVals({{3, 2, 4, 5},
                    {0, -2, 3, 1},
                    {9, 8, 7, 6}});
    // tensor.Print(0,0); // print full tensor

    auto count = matx::make_tensor<int, 1>({row});
    // count.Print(0); // print full count

    // Goal: For each row, count the number of elements greater than 2.
    // And then find the row with the max count

    // the kind of reduction is determined through the shapes of tensor and count
    matx::sum(count, matx::as_int(tensor > 2));

    // A single value (scalar) is a tensor of rank 0: 
    auto result_idx = matx::make_tensor<matx::index_t>();
    auto result = matx::make_tensor<int>();
    matx::argmax(result, result_idx, count);

    std::cout << "result = " << result() 
              << " at position " << result_idx() << "\r\n";
    // result = 4 at position 2

    return 0;

由于 MatX 使用延迟执行运算符, matx::as_int(tensor > 2)有效地融合到 kernel 中,实现与在 Thrust 中使用thrust::transform_iterator相同的效果。

由于 MatX 了解问题的规则结构而 Thrust 并不了解,因此 MatX 方案的性能有可能优于 Thrust 方案。它当然也更优雅。此外,还可以在已经分配好的内存上构造张量,因此这两个库可以混用:例如,把 thrust::raw_pointer_cast(vec.data()) 传给张量的构造函数,就能在名为 vec 的 thrust::vector 所占用的内存上构造张量。


