
Floyd-Warshall algorithm in parallel using CUDA

I'm trying to implement the Floyd-Warshall algorithm using CUDA, but I'm having a synchronization problem. This is my code:

__global__ void run_on_gpu(const int graph_size, int *output, int k) {
  int i = blockDim.y * blockIdx.y + threadIdx.y;
  int j = blockDim.x * blockIdx.x + threadIdx.x;

  if (D(i, k) + D(k, j) < D(i, j)) {
    D(i, j) = D(i, k) + D(k, j);
  }
}

void floyd_warshall_gpu(const int *graph, int graph_size, int *output) {
  int *dev_output;

  HANDLE_ERROR( cudaMalloc(&dev_output, sizeof(int) * graph_size * graph_size) );

  cudaMemcpy(dev_output, graph, sizeof(int) * graph_size * graph_size, cudaMemcpyHostToDevice);
  dim3 blocks(BLOCKS_PER_GRAPH_SIDE, BLOCKS_PER_GRAPH_SIDE, 1);
  dim3 threadsPerBlock(THREADS_PER_BLOCK_SIDE, THREADS_PER_BLOCK_SIDE, 1);
  int k;
  for (k = 0; k < graph_size; k++) {
    run_on_gpu<<<blocks, threadsPerBlock>>>(graph_size, dev_output, k);
  }
  cudaMemcpy(output, dev_output, sizeof(int) * graph_size * graph_size, cudaMemcpyDeviceToHost);

  cudaFree(dev_output);
}

These are my initial variables:

#define GRAPH_SIZE 2000

#define EDGE_COST(graph, graph_size, a, b) graph[a * graph_size + b]
#define D(a, b) EDGE_COST(output, graph_size, a, b)

#define INF 0x1fffffff

#define THREADS_PER_BLOCK_SIDE 16 // Each block has 16 * 16 = 256 threads
#define BLOCKS_PER_GRAPH_SIDE GRAPH_SIZE / THREADS_PER_BLOCK_SIDE

This is how I'm generating the graph:

void generate_random_graph(int *output, int graph_size) {
  int i, j;

  srand(0xdadadada);

  for (i = 0; i < graph_size; i++) {
    for (j = 0; j < graph_size; j++) {
      if (i == j) {
        D(i, j) = 0;
      }
      else {
        int r;
        r = rand() % 40;
        if (r > 20) {
          r = INF;
        }

        D(i, j) = r;
      }
    }
  }
}

When I set GRAPH_SIZE to a smaller number, like 100, the result is incorrect.

I have written the algorithm sequentially on the CPU, as in the code below:

void floyd_warshall_cpu(const int *graph, int graph_size, int *output) {
  int i, j, k;

  memcpy(output, graph, sizeof(int) * graph_size * graph_size);

  for (k = 0; k < graph_size; k++) {
    for (i = 0; i < graph_size; i++) {
      for (j = 0; j < graph_size; j++) {
        if (D(i, k) + D(k, j) < D(i, j)) {
          D(i, j) = D(i, k) + D(k, j);
        }
      }
    }
  }
}

And I run and test it like this:

int main(int argc, char **argv) {
  int *graph, *output_cpu, *output_gpu;
  int size;
  size = sizeof(int) * GRAPH_SIZE * GRAPH_SIZE;
  graph = (int *)malloc(size);
  output_cpu = (int *)malloc(size);
  assert(output_cpu);
  memset(output_cpu, 0, size);
  output_gpu = (int *)malloc(size);
  generate_random_graph(graph, GRAPH_SIZE);
  floyd_warshall_cpu(graph, GRAPH_SIZE, output_cpu);
  floyd_warshall_gpu(graph, GRAPH_SIZE, output_gpu);
  if (memcmp(output_cpu, output_gpu, size) != 0) {
    fprintf(stderr, "FAIL!\n");
  }
  else {
    fprintf(stderr, "SUCCESS!\n");
  }
  free(graph);
  free(output_cpu);
  free(output_gpu);
  return 0;
}

Can anyone give me an idea how to solve this?

The main problem I can find is that your grid sizing is not done correctly.

With GRAPH_SIZE = 2000 and a thread block side dimension of 16, the graph side happens to be evenly divisible by the block side. But if you reduce GRAPH_SIZE to 100, it is not.

We can fix that by "rounding up" your grid dimensions:

#define BLOCKS_PER_GRAPH_SIDE ((GRAPH_SIZE+THREADS_PER_BLOCK_SIDE-1) / THREADS_PER_BLOCK_SIDE)
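With the original macro and GRAPH_SIZE = 100, the integer division truncates: 100 / 16 = 6 blocks per side, i.e. only 6 * 16 = 96 threads per side, so rows and columns 96 through 99 of the matrix are never updated. The rounded-up version gives (100 + 15) / 16 = 7 blocks per side, i.e. 112 threads per side, which covers the whole matrix.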

And by adding a thread check to your kernel, so that the extra threads in the final partial blocks don't read or write out of bounds:

  if ((i < graph_size) && (j < graph_size))

Here's a modified version of the code that seems to run correctly for me:

$ cat t92.cu
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cassert>


#define GRAPH_SIZE 100

#define EDGE_COST(graph, graph_size, a, b) graph[a * graph_size + b]
#define D(a, b) EDGE_COST(output, graph_size, a, b)

#define INF 0x1fffffff

#define THREADS_PER_BLOCK_SIDE 16
#define BLOCKS_PER_GRAPH_SIDE ((GRAPH_SIZE+THREADS_PER_BLOCK_SIDE-1) / THREADS_PER_BLOCK_SIDE)
#define HANDLE_ERROR(x) x
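// NOTE: HANDLE_ERROR is deliberately a no-op in this test code (see the note after the listing)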



__global__ void run_on_gpu(const int graph_size, int *output, int k) {
  int i = blockDim.y * blockIdx.y + threadIdx.y;
  int j = blockDim.x * blockIdx.x + threadIdx.x;
  if ((i < graph_size) && (j < graph_size))
    if (D(i, k) + D(k, j) < D(i, j)) {
      D(i, j) = D(i, k) + D(k, j);
    }
}

void floyd_warshall_gpu(const int *graph, int graph_size, int *output) {
  int *dev_output;

  HANDLE_ERROR( cudaMalloc(&dev_output, sizeof(int) * graph_size * graph_size) );

  cudaMemcpy(dev_output, graph, sizeof(int) * graph_size * graph_size, cudaMemcpyHostToDevice);
  dim3 blocks(BLOCKS_PER_GRAPH_SIDE, BLOCKS_PER_GRAPH_SIDE, 1);
  dim3 threadsPerBlock(THREADS_PER_BLOCK_SIDE, THREADS_PER_BLOCK_SIDE, 1);
  int k;
  for (k = 0; k < graph_size; k++) {
    run_on_gpu<<<blocks, threadsPerBlock>>>(graph_size, dev_output, k);
  }
  cudaMemcpy(output, dev_output, sizeof(int) * graph_size * graph_size, cudaMemcpyDeviceToHost);

  cudaFree(dev_output);
}

void generate_random_graph(int *output, int graph_size) {
  int i, j;

  srand(0xdadadada);

  for (i = 0; i < graph_size; i++) {
    for (j = 0; j < graph_size; j++) {
      if (i == j) {
        D(i, j) = 0;
      }
      else {
        int r;
        r = rand() % 1000;
        if (r > 20) {
          D(i, j) = INF;
        }
        else
          D(i, j) = r+10;
      }
    }
  }
}

void floyd_warshall_cpu(const int *graph, int graph_size, int *output) {
  int i, j, k;

  memcpy(output, graph, sizeof(int) * graph_size * graph_size);

  for (k = 0; k < graph_size; k++) {
    for (i = 0; i < graph_size; i++) {
      for (j = 0; j < graph_size; j++) {
        if (D(i, k) + D(k, j) < D(i, j)) {
          D(i, j) = D(i, k) + D(k, j);
        }
      }
    }
  }
}

int main(int argc, char **argv) {
  int *graph, *output_cpu, *output_gpu;
  int size;
  size = sizeof(int) * GRAPH_SIZE * GRAPH_SIZE;
  graph = (int *)malloc(size);
  output_cpu = (int *)malloc(size);
  assert(output_cpu);
  memset(output_cpu, 0, size);
  output_gpu = (int *)malloc(size);
  generate_random_graph(graph, GRAPH_SIZE);
  floyd_warshall_cpu(graph, GRAPH_SIZE, output_cpu);
  floyd_warshall_gpu(graph, GRAPH_SIZE, output_gpu);
  if (memcmp(output_cpu, output_gpu, size) != 0) {
    fprintf(stderr, "FAIL!\n");
    int qq = 0;
    for (int i = 0; i < GRAPH_SIZE*GRAPH_SIZE; i++)
    if (output_cpu[i] != output_gpu[i]) {qq++; printf("i: %d, cpu: %d, gpu: %d\n",i, output_cpu[i], output_gpu[i]);}
    printf("# mismatches: %d\n", qq);
  }
  else {
    fprintf(stderr, "SUCCESS!\n");
  //  for (int i = 0; i < 100; i++)
  //   printf("i: %d, cpu: %d, gpu: %d\n",i, output_cpu[i], output_gpu[i]);
  }
  free(graph);
  free(output_cpu);
  free(output_gpu);
  return 0;
}
$ nvcc -o t92 t92.cu
$ vi t92.cu
$ cuda-memcheck ./t92
========= CUDA-MEMCHECK
SUCCESS!
========= ERROR SUMMARY: 0 errors
$

(I've modified your test case slightly, as it was producing an output matrix that was mostly zero.)
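A couple of side notes. On the synchronization concern in the title: launching one kernel per value of k from the host, as both versions do, already provides the ordering the algorithm needs, since kernels launched into the same (default) stream execute in order, and the final cudaMemcpy blocks until the last launch has finished. Also, HANDLE_ERROR is stubbed out as a no-op in the listing above, so launch or copy failures would go unnoticed. Here is a minimal sketch of what stricter checking could look like, reusing the same macro name; the macro body and the extra checks after the k loop are not part of the original code, just one possible approach:

// Abort on any CUDA API error, reporting where it happened.
#define HANDLE_ERROR(x) do {                                    \
    cudaError_t err = (x);                                      \
    if (err != cudaSuccess) {                                   \
      fprintf(stderr, "CUDA error: %s at %s:%d\n",              \
              cudaGetErrorString(err), __FILE__, __LINE__);     \
      exit(EXIT_FAILURE);                                       \
    }                                                           \
  } while (0)

// Inside floyd_warshall_gpu, after the k loop:
//   HANDLE_ERROR(cudaGetLastError());       // reports invalid launch configurations
//   HANDLE_ERROR(cudaDeviceSynchronize());  // surfaces asynchronous execution errors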
