使用Thrust排序静态分配的数组

Question

In my code, I have a statically allocated array in global memory (i.e., allocated using __device__), which I want to sort using thrust::sort, but this isn't working. 在我的代码中，我在全局内存中有一个静态分配的数组（即使用__device__分配的数组），我想使用thrust::sort对其排序，但这不起作用。 All of the examples on this topic are using CUDA runtime allocated arrays (using cudaMalloc). 关于该主题的所有示例都使用CUDA运行时分配的数组（使用cudaMalloc）。 Is there any way I can sort a statically allocated array? 有什么办法可以对静态分配的数组进行排序？

I guess it has something to do with statically allocated memory not being accessible from the host. 我想这与静态分配的内存无法从主机访问有关。 Using cudaMalloc-allocated arrays, it is working fine. 使用cudaMalloc分配的数组时，它可以正常工作。 However, I want to avoid using this type of allocation since static allocation allows for easier access to the data from device code (doesn't it?). 但是，我想避免使用这种类型的分配，因为静态分配可以更轻松地从设备代码访问数据（不是吗？）。

Minimal (not-) working example: 最小（非）工作示例：

#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#define N 4

// Key/value record that is being sorted.
// operator< returns `key > e.key`, so the element with the LARGER key compares
// as "less"; sorting with this comparison orders keys from largest to smallest.
// The operator is callable from both host and device code.
typedef struct element {
  int key;
  int value;
  __host__ __device__ bool operator<(element e) const
  { return key > e.key; }
} element;

// Statically allocated array of N elements, declared with __device__.
__device__ element array[N];

// Kernel: writes the keys 0..N-1 into `array` back to front, so that
// array[0].key == N-1 and array[N-1].key == 0. Only `key` is assigned here;
// `value` is not touched. The loop is serial and does not use thread indices;
// main launches this kernel as <<<1,1>>>.
__global__ void init() {
  for (int i = 0; i < N; ++i) {
    array[N - i - 1].key = i;
  }
}

// Kernel: prints the key of every element of `array` on one line, each
// followed by a space, then a newline. The loop is serial and does not use
// thread indices; main launches this kernel as <<<1,1>>>.
__global__ void print_array() {
  for (int i = 0; i < N; ++i) {
    printf("%d ", array[i].key);
  }
  printf("\n");
}

// Fills the device array, prints it, sorts it with thrust::sort and prints it
// again. This is the version the question reports as NOT working.
int main(void) {
  // NOTE(review): `array` is a __device__ symbol referenced directly from host
  // code here; this is the statement the page identifies as the problem. The
  // device address has to be obtained with cudaGetSymbolAddress instead.
  thrust::device_ptr<element> array_first(array);

  init<<<1,1>>>();

  printf("unsorted: ");
  print_array<<<1, 1>>>();
  // Wait for the kernel so that its printf output is emitted before continuing.
  cudaDeviceSynchronize();

  thrust::sort(array_first, array_first + N);

  printf("sorted: ");
  print_array<<<1, 1>>>();
  cudaDeviceSynchronize();
  // NOTE(review): none of the CUDA return codes above are checked.
}

Answer 1

Use cudaGetSymbolAddress to take the address of the array variable from a __host__ function: 使用cudaGetSymbolAddress从__host__函数获取array变量的地址：

// Ask the CUDA runtime for the device address of the __device__ symbol `array`,
// then wrap that address in a thrust::device_ptr.
void* array_ptr = 0;
cudaGetSymbolAddress(&array_ptr, array);
thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));

Here's the complete program: 这是完整的程序：

#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>

#define N 4

// Record holding an integer key and an integer payload.
// The less-than operator compares keys in reverse: `a < b` is true when a's key
// is greater than b's key, so a sort using it puts larger keys first.
typedef struct element {
  int key;
  int value;

  __host__ __device__ bool operator<(element e) const
  {
    return e.key < key;
  }
} element;

// Statically allocated array of N elements, declared with __device__; main
// looks up its device address with cudaGetSymbolAddress.
__device__ element array[N];

// Kernel: stores keys 0..N-1 into `array` starting from the last slot, leaving
// array[0].key == N-1 and array[N-1].key == 0. Only the `key` member is written.
// The loop is serial and ignores thread indices; main launches it as <<<1,1>>>.
__global__ void init() {
  int slot = N - 1;
  for (int k = 0; k < N; ++k, --slot) {
    array[slot].key = k;
  }
}

// Kernel: prints every key of `array` in index order, each followed by a
// space, and finishes the line with a newline. The loop is serial and ignores
// thread indices; main launches it as <<<1,1>>>.
__global__ void print_array() {
  int idx = 0;
  while (idx < N) {
    printf("%d ", array[idx].key);
    ++idx;
  }
  printf("\n");
}

// Throws thrust::system_error when a CUDA runtime call reports a failure.
static void throw_on_cuda_error(cudaError_t status) {
  if (status != cudaSuccess) {
    throw thrust::system_error(status, thrust::cuda_category());
  }
}

// Fills the __device__ array, prints it, sorts it with thrust::sort and prints
// it again. Every CUDA runtime call and every kernel launch is checked; a
// failure is raised as thrust::system_error. Returns 0 on success.
int main(void) {
  // `array` is a __device__ symbol, so its device address has to be looked up
  // through the runtime before it can be handed to Thrust.
  void* array_ptr = 0;
  throw_on_cuda_error(cudaGetSymbolAddress(&array_ptr, array));

  thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));

  init<<<1,1>>>();
  // A kernel launch returns no status itself; launch errors are read back here.
  throw_on_cuda_error(cudaGetLastError());

  printf("unsorted: ");
  print_array<<<1, 1>>>();
  throw_on_cuda_error(cudaGetLastError());
  // Synchronize so the kernel has finished (and execution errors surface)
  // before the sort starts.
  throw_on_cuda_error(cudaDeviceSynchronize());

  thrust::sort(array_first, array_first + N);
  throw_on_cuda_error(cudaDeviceSynchronize());

  printf("sorted: ");
  print_array<<<1, 1>>>();
  throw_on_cuda_error(cudaGetLastError());
  throw_on_cuda_error(cudaDeviceSynchronize());

  return 0;
}

Here's the output on my system: 这是我系统上的输出：

$ nvcc test.cu -run
unsorted: 3 2 1 0 
sorted: 3 2 1 0

The sorted output is the same as the unsorted output, but I guess that is intentional given the way the data is generated and the definition of element::operator< . 排序后的输出与未排序后的输出相同，但是考虑到生成数据的方式以及element::operator<的定义，我想这是故意的。

Answer 2

This: 这个：

__device__ element array[N];

...

thrust::device_ptr<element> array_first(array);

is illegal. 是非法的。 In host code, array is a host address and can't be passed to device code. 在主机代码中， array是主机地址，不能传递给设备代码。 Do something like this instead: 做这样的事情：

// Query the device address of the __device__ symbol `array` at run time and
// build the thrust::device_ptr from that address.
// NOTE: the cudaError_t returned by cudaGetSymbolAddress is not checked in this snippet.
element* array_d;
cudaGetSymbolAddress((void **)&array_d, array);
thrust::device_ptr<element> array_first(array_d);

i.e., you need to use cudaGetSymbolAddress to read the address from the GPU context at runtime; you can then use the result of that call in GPU code. 也就是说，您需要在运行时使用cudaGetSymbolAddress从GPU上下文读取地址，然后可以在GPU代码中使用该调用的结果。

使用Thrust排序静态分配的数组

问题描述

Minimal (not-) working example: 最小（非）工作示例：

2 个解决方案

解决方案1
2 已采纳 2016-01-21 20:13:19

解决方案2
2

使用Thrust排序静态分配的数组

问题描述

Minimal (not-) working example: 最小（非）工作示例：

2 个解决方案

解决方案1 2 已采纳 2016-01-21 20:13:19

解决方案2 2

解决方案1
2 已采纳 2016-01-21 20:13:19

解决方案2
2