Cuda內核返回向量

Question

我有一個單詞列表，我的目標是在一個非常長的短語中匹配每個單詞。 我在匹配每個單詞方面沒有問題，我唯一的問題是返回包含每個匹配信息的結構向量。

在代碼中：

typedef struct {
    int A, B, C; } Match;

__global__ void Find(veryLongPhrase * _phrase, Words * _word_list, vector<Match> * _matches)
{
    int a, b, c;

    [...] //Parallel search for each word in the phrase

    if(match) //When an occurrence is found
    {
        _matches.push_back(new Match{ A = a, B = b, C = c }); //Here comes the unknown, what should I do here???
    }
}

main()
{
    [...]

    veryLongPhrase * myPhrase = "The quick brown fox jumps over the lazy dog etc etc etc..."

    Words * wordList = {"the", "lazy"};

    vector<Match> * matches; //Obviously I can't pass a vector to a kernel

    Find<<< X, Y >>>(myPhrase, wordList, matches);

    [...]

}

我試過Thrust庫但沒有任何成功，你能建議我任何解決方案嗎？

非常感謝你。

Answer 1

這樣的東西應該工作（在瀏覽器中編碼，未經測試）：

// N is the maximum number of structs to insert
#define N 10000

typedef struct {
    int A, B, C; } Match;

__device__ Match dev_data[N];
__device__ int dev_count = 0;

__device__ int my_push_back(Match * mt) {
  int insert_pt = atomicAdd(&dev_count, 1);
  if (insert_pt < N){
    dev_data[insert_pt] = *mt;
    return insert_pt;}
  else return -1;}

__global__ void Find(veryLongPhrase * _phrase, Words * _word_list, vector<Match> * _matches)
{
    int a, b, c;

    [...] //Parallel search for each word in the phrase

    if(match) //When an occurrence is found
    {
        my_push_back(new Match{ A = a, B = b, C = c });    }
}


main()
{
    [...]

    veryLongPhrase * myPhrase = "The quick brown fox jumps over the lazy dog etc etc etc..."

    Words * wordList = {"the", "lazy"};

    Find<<< X, Y >>>(myPhrase, wordList);

    int dsize;
    cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int));
    vector<Match> results(dsize);
    cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match));

    [...]

}

這將需要1.1或更高的計算能力用於原子操作。

nvcc -arch=sm_11 ...

這是一個有效的例子：

$ cat t347.cu
#include <iostream>
#include <vector>

// N is the maximum number of structs to insert
#define N 10000

typedef struct {
    int A, B, C; } Match;

__device__ Match dev_data[N];
__device__ int dev_count = 0;

__device__ int my_push_back(Match & mt) {
  int insert_pt = atomicAdd(&dev_count, 1);
  if (insert_pt < N){
    dev_data[insert_pt] = mt;
    return insert_pt;}
  else return -1;}

__global__ void Find()
{

    if(threadIdx.x < 10) //Simulate a found occurrence
    {
        Match a = { .A = 1, .B = 2, .C = 3 };
        my_push_back(a);    }
}


main()
{

    Find<<< 2, 256 >>>();

    int dsize;
    cudaMemcpyFromSymbol(&dsize, dev_count, sizeof(int));
    if (dsize >= N) {printf("overflow error\n"); return 1;}
    std::vector<Match> results(dsize);
    cudaMemcpyFromSymbol(&(results[0]), dev_data, dsize*sizeof(Match));
    std::cout << "number of matches = " << dsize << std::endl;
    std::cout << "A  =  " << results[dsize-1].A << std:: endl;
    std::cout << "B  =  " << results[dsize-1].B << std:: endl;
    std::cout << "C  =  " << results[dsize-1].C << std:: endl;

}
$ nvcc -arch=sm_11 -o t347 t347.cu
$ ./t347
number of matches = 20
A  =  1
B  =  2
C  =  3
$

請注意，在這種情況下，我的Match結果結構創建是不同的，我通過引用傳遞，但概念是相同的。

Cuda內核返回向量

問題描述

1 個解決方案

解決方案1
5 已采納 2014-02-14 20:10:07

Cuda內核返回向量

問題描述

1 個解決方案

解決方案1 5 已采納 2014-02-14 20:10:07

解決方案1
5 已采納 2014-02-14 20:10:07