[英]Floyd Warshall algorithm in parallel using cuda
我正在尝试使用 cuda 实现 Floyd Warshall 算法,但我遇到了同步喇叭问题。 这是我的代码:
__global__ void run_on_gpu(const int graph_size, int *output, int k) {
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
void floyd_warshall_gpu(const int *graph, int graph_size, int *output) {
int *dev_output;
HANDLE_ERROR( cudaMalloc(&dev_output, sizeof(int) * graph_size * graph_size) );
cudaMemcpy(dev_output, graph, sizeof(int) * graph_size * graph_size, cudaMemcpyHostToDevice);
dim3 blocks(BLOCKS_PER_GRAPH_SIDE, BLOCKS_PER_GRAPH_SIDE, 1);
dim3 threadsPerBlock(THREADS_PER_BLOCK_SIDE, THREADS_PER_BLOCK_SIDE, 1);
int k;
for (k = 0; k < graph_size; k++) {
run_on_gpu<<<blocks, threadsPerBlock>>>(graph_size, dev_output, k);
}
cudaMemcpy(output, dev_output, sizeof(int) * graph_size * graph_size, cudaMemcpyDeviceToHost);
cudaFree(dev_output);
}
这是我的初始变量:
#define GRAPH_SIZE 2000
#define EDGE_COST(graph, graph_size, a, b) graph[a * graph_size + b]
#define D(a, b) EDGE_COST(output, graph_size, a, b)
#define INF 0x1fffffff
#define THREADS_PER_BLOCK_SIDE 16 // Each block have 16 * 16 = 256 threads
#define BLOCKS_PER_GRAPH_SIDE GRAPH_SIZE / THREADS_PER_BLOCK_SIDE
这就是我生成图表的方式:
void generate_random_graph(int *output, int graph_size) {
int i, j;
srand(0xdadadada);
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (i == j) {
D(i, j) = 0;
}
else {
int r;
r = rand() % 40;
if (r > 20) {
r = INF;
}
D(i, j) = r;
}
}
}
}
当我将 GRAPH_SIZE 设置为较小的数字(如 100)时,结果不正确。
我已经在 cpu 上按顺序编写了算法,如下面的代码:
void floyd_warshall_cpu(const int *graph, int graph_size, int *output) {
int i, j, k;
memcpy(output, graph, sizeof(int) * graph_size * graph_size);
for (k = 0; k < graph_size; k++) {
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
}
}
}
我像这样运行和测试它:
int main(int argc, char **argv) {
int *graph, *output_cpu, *output_gpu;
int size;
size = sizeof(int) * GRAPH_SIZE * GRAPH_SIZE;
graph = (int *)malloc(size);
output_cpu = (int *)malloc(size);
assert(output_cpu);
memset(output_cpu, 0, size);
output_gpu = (int *)malloc(size);
generate_random_graph(graph, GRAPH_SIZE);
floyd_warshall_cpu(graph, GRAPH_SIZE, output_cpu);
floyd_warshall_gpu(graph, GRAPH_SIZE, output_gpu);
if (memcmp(output_cpu, output_gpu, size) != 0) {
fprintf(stderr, "FAIL!\n");
}
else {
fprintf(stderr, "SUCCESS!\n");
}
free(graph);
free(output_cpu);
free(output_gpu);
return 0;
}
谁能给我一个想法如何解决这个问题?
我能找到的主要问题似乎是你的网格大小没有正确完成。
在 N=2000 和线程块边尺寸为 16 的情况下,这恰好是整数可整除。 但如果你将 N 减少到 100,则不是。
我们可以通过“四舍五入”您的网格尺寸来解决这个问题:
#define BLOCKS_PER_GRAPH_SIDE ((GRAPH_SIZE+THREADS_PER_BLOCK_SIDE-1) / THREADS_PER_BLOCK_SIDE)
并为您的 kernel 添加线程检查:
if ((i < graph_size) && (j < graph_size))
这是一个修改后的代码,对我来说似乎可以正确运行:
$ cat t92.cu
#include <cstdio>
#include <cassert>
#define GRAPH_SIZE 100
#define EDGE_COST(graph, graph_size, a, b) graph[a * graph_size + b]
#define D(a, b) EDGE_COST(output, graph_size, a, b)
#define INF 0x1fffffff
#define THREADS_PER_BLOCK_SIDE 16
#define BLOCKS_PER_GRAPH_SIDE ((GRAPH_SIZE+THREADS_PER_BLOCK_SIDE-1) / THREADS_PER_BLOCK_SIDE)
#define HANDLE_ERROR(x) x
__global__ void run_on_gpu(const int graph_size, int *output, int k) {
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
if ((i < graph_size) && (j < graph_size))
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
void floyd_warshall_gpu(const int *graph, int graph_size, int *output) {
int *dev_output;
HANDLE_ERROR( cudaMalloc(&dev_output, sizeof(int) * graph_size * graph_size) );
cudaMemcpy(dev_output, graph, sizeof(int) * graph_size * graph_size, cudaMemcpyHostToDevice);
dim3 blocks(BLOCKS_PER_GRAPH_SIDE, BLOCKS_PER_GRAPH_SIDE, 1);
dim3 threadsPerBlock(THREADS_PER_BLOCK_SIDE, THREADS_PER_BLOCK_SIDE, 1);
int k;
for (k = 0; k < graph_size; k++) {
run_on_gpu<<<blocks, threadsPerBlock>>>(graph_size, dev_output, k);
}
cudaMemcpy(output, dev_output, sizeof(int) * graph_size * graph_size, cudaMemcpyDeviceToHost);
cudaFree(dev_output);
}
void generate_random_graph(int *output, int graph_size) {
int i, j;
srand(0xdadadada);
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (i == j) {
D(i, j) = 0;
}
else {
int r;
r = rand() % 1000;
if (r > 20) {
D(i, j) = INF;
}
else
D(i, j) = r+10;
}
}
}
}
void floyd_warshall_cpu(const int *graph, int graph_size, int *output) {
int i, j, k;
memcpy(output, graph, sizeof(int) * graph_size * graph_size);
for (k = 0; k < graph_size; k++) {
for (i = 0; i < graph_size; i++) {
for (j = 0; j < graph_size; j++) {
if (D(i, k) + D(k, j) < D(i, j)) {
D(i, j) = D(i, k) + D(k, j);
}
}
}
}
}
int main(int argc, char **argv) {
int *graph, *output_cpu, *output_gpu;
int size;
size = sizeof(int) * GRAPH_SIZE * GRAPH_SIZE;
graph = (int *)malloc(size);
output_cpu = (int *)malloc(size);
assert(output_cpu);
memset(output_cpu, 0, size);
output_gpu = (int *)malloc(size);
generate_random_graph(graph, GRAPH_SIZE);
floyd_warshall_cpu(graph, GRAPH_SIZE, output_cpu);
floyd_warshall_gpu(graph, GRAPH_SIZE, output_gpu);
if (memcmp(output_cpu, output_gpu, size) != 0) {
fprintf(stderr, "FAIL!\n");
int qq = 0;
for (int i = 0; i < GRAPH_SIZE*GRAPH_SIZE; i++)
if (output_cpu[i] != output_gpu[i]) {qq++; printf("i: %d, cpu: %d, gpu: %d\n",i, output_cpu[i], output_gpu[i]);}
printf("# mismatches: %d\n", qq);
}
else {
fprintf(stderr, "SUCCESS!\n");
// for (int i = 0; i < 100; i++)
// printf("i: %d, cpu: %d, gpu: %d\n",i, output_cpu[i], output_gpu[i]);
}
free(graph);
free(output_cpu);
free(output_gpu);
return 0;
}
$ nvcc -o t92 t92.cu
$ vi t92.cu
$ cuda-memcheck ./t92
========= CUDA-MEMCHECK
SUCCESS!
========= ERROR SUMMARY: 0 errors
$
(我稍微修改了您的测试用例,因为它正在生成一个几乎为零的 output 矩阵。)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.