简体   繁体   English

Cuda内核返回向量

[英]Cuda kernel returning vectors

I have a list of words, and my goal is to match each word in a very, very long phrase. 我有一个单词列表,我的目标是在一个非常长的短语中匹配每个单词。 I have no problem matching each word; my only problem is returning a vector of structures containing information about each match. 我在匹配每个单词方面没有问题,我唯一的问题是返回包含每个匹配信息的结构向量。

In code: 在代码中:

typedef struct {
    int A, B, C; } Match;

__global__ void Find(veryLongPhrase * _phrase, Words * _word_list, vector<Match> * _matches)
{
    int a, b, c;

    [...] //Parallel search for each word in the phrase

    if(match) //When an occurrence is found
    {
        _matches.push_back(new Match{ A = a, B = b, C = c }); //Here comes the unknown, what should I do here???
    }
}

main()
{
    [...]

    veryLongPhrase * myPhrase = "The quick brown fox jumps over the lazy dog etc etc etc..."

    Words * wordList = {"the", "lazy"};

    vector<Match> * matches; //Obviously I can't pass a vector to a kernel

    Find<<< X, Y >>>(myPhrase, wordList, matches);

    [...]

}

I have tried the Thrust library without any success; can you suggest any kind of solution? 我试过Thrust库但没有任何成功,你能建议我任何解决方案吗?

Thank you very much. 非常感谢你。

Something like this should work (coded in browser, not tested): 这样的东西应该工作(在浏览器中编码,未经测试):

// N is the maximum number of structs to insert
#define N 10000

// One recorded match: three integer fields filled in by the kernel.
typedef struct {
    int A, B, C; } Match;

// Fixed-capacity result buffer in device memory; holds at most N matches.
__device__ Match dev_data[N];
// Number of push attempts made so far. my_push_back() increments it before
// checking capacity, so it can end up larger than N when the buffer overflowed.
__device__ int dev_count = 0;

// Device-side replacement for vector::push_back.
// Atomically reserves the next slot of dev_data and copies *mt into it.
// Returns the index that was written, or -1 when the buffer is already full
// (dev_count is still incremented in that case).
__device__ int my_push_back(Match * mt) {
  const int slot = atomicAdd(&dev_count, 1);
  const bool has_room = slot < N;
  if (has_room) {
    dev_data[slot] = *mt;
  }
  return has_room ? slot : -1;
}

// Sketch of the search kernel: every thread that finds an occurrence appends
// one Match to the global device buffer through my_push_back().
// There is no std::vector parameter: STL containers cannot be used in device
// code, and the launch in main() passes only the phrase and the word list.
__global__ void Find(veryLongPhrase * _phrase, Words * _word_list)
{
    int a, b, c;

    [...] //Parallel search for each word in the phrase

    if(match) //When an occurrence is found
    {
        // Build the record in a local variable. Device-side `new` would leak,
        // because nothing ever deletes the allocation, and my_push_back()
        // copies the struct into dev_data anyway.
        Match found = { a, b, c };
        my_push_back(&found);
    }
}


// Host side of the sketch: launch the search, then pull the match count and
// the recorded matches back from the device symbols.
int main()
{
    [...]

    // NOTE(review): placeholders only. Real code must copy the phrase and the
    // word list into device memory before launching; a kernel cannot
    // dereference host pointers.
    veryLongPhrase * myPhrase = "The quick brown fox jumps over the lazy dog etc etc etc...";

    Words * wordList = {"the", "lazy"};

    Find<<< X, Y >>>(myPhrase, wordList);

    int dsize = 0;
    cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int));
    // dev_count counts every push attempt, so it exceeds N when the buffer
    // overflowed; only the first N entries of dev_data actually exist.
    if (dsize > N) dsize = N;
    vector<Match> results(dsize);
    // Taking &results[0] of an empty vector is undefined, so skip the copy
    // when nothing matched.
    if (dsize > 0)
        cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match));

    [...]

    return 0;
}

This will require compute capability 1.1 or better for the atomic operation. 这将需要1.1或更高的计算能力用于原子操作。

nvcc -arch=sm_11 ...

Here's a worked example: 这是一个有效的例子:

$ cat t347.cu
#include <iostream>
#include <vector>

// N is the maximum number of structs to insert
#define N 10000

// Result record written by the kernel for each simulated match.
typedef struct {
    int A;
    int B;
    int C;
} Match;

// Number of push attempts; incremented atomically by my_push_back().
__device__ int dev_count = 0;
// Device-resident output buffer with room for N records.
__device__ Match dev_data[N];

// Appends a copy of mt to dev_data.
// The slot is reserved with an atomic increment of dev_count, so concurrent
// callers never receive the same index.
// Returns the index written, or -1 if all N slots were already taken
// (dev_count keeps counting in that case, which lets the host detect overflow).
__device__ int my_push_back(Match & mt) {
  const int slot = atomicAdd(&dev_count, 1);
  if (slot >= N) {
    return -1;
  }
  dev_data[slot] = mt;
  return slot;
}

// Demo kernel: stands in for the real search. Threads with threadIdx.x < 10
// in each block pretend to have found an occurrence and push one record.
__global__ void Find()
{
    // Everyone else has nothing to report.
    if (threadIdx.x >= 10) return;

    //Simulate a found occurrence
    Match hit = { 1, 2, 3 };   // A, B, C in declaration order
    my_push_back(hit);
}


// Host driver for the worked example.
// Launches Find on 2 blocks of 256 threads, reads back how many matches were
// pushed, copies them into a std::vector and prints the last one.
// Returns 0 on success, 1 on a CUDA error or when more than N pushes were
// attempted (some matches were dropped).
int main()
{

    Find<<< 2, 256 >>>();
    // A kernel launch returns nothing; a bad launch configuration only shows
    // up through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    int dsize = 0;
    // Errors raised while the kernel was running are reported by this call.
    err = cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int));
    if (err != cudaSuccess) {
        std::cerr << "reading dev_count failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    // dev_count == N means the buffer is exactly full, which is still valid;
    // only a count above N means pushes were rejected.
    if (dsize > N) {std::cerr << "overflow error\n"; return 1;}

    std::vector<Match> results(dsize);
    if (dsize > 0) {
        // &results[0] is only valid for a non-empty vector.
        err = cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match));
        if (err != cudaSuccess) {
            std::cerr << "reading dev_data failed: " << cudaGetErrorString(err) << std::endl;
            return 1;
        }
    }
    std::cout << "number of matches = " << dsize << std::endl;
    // With no matches there is no last element to print.
    if (dsize == 0) return 0;
    std::cout << "A  =  " << results[dsize-1].A << std:: endl;
    std::cout << "B  =  " << results[dsize-1].B << std:: endl;
    std::cout << "C  =  " << results[dsize-1].C << std:: endl;

    return 0;
}
$ nvcc -arch=sm_11 -o t347 t347.cu
$ ./t347
number of matches = 20
A  =  1
B  =  2
C  =  3
$

Note that in this case my Match result struct creation is different, and I am passing by reference, but the concept is the same. 请注意,在这种情况下,我的Match结果结构创建是不同的,我通过引用传递,但概念是相同的。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM