简体   繁体   English

在 OpenCl 中,多个 gpu 比单个 gpu 慢。 我怎样才能做得更快?

[英]In OpenCl, multiple gpu is slower than single gpu. How can I make faster?

I made vector addition kernel and run it in the single gpu and multiple gpu.我制作了矢量加法内核并在单个 gpu 和多个 gpu 中运行它。 However in Multi gpu case is much slower than single gpu in the same length of vector addition.然而,在相同长度的向量加法中,多 gpu 的情况比单 gpu 慢得多。

The structure of my code is one context, one kernel, and one command queue per device. How can I modify the code to make the multi-GPU case faster?我的代码的结构是一个上下文,一个内核和多个具有相同设备数量的队列。如何在多 GPU 情况下进行更快的修改?

The code is below代码如下

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <CL/cl.h>
#include <math.h>

//#define VECTOR_SIZE 640000
//#define LOCAL_SIZE 64

/* Abort with file/line context when an OpenCL call fails.
 * Wrapped in do { } while (0) so the macro behaves like a single
 * statement and is safe inside unbraced if/else bodies; arguments
 * are parenthesized against operator-precedence surprises. */
#define CHECK_ERROR(err) \
  do { \
    if ((err) != CL_SUCCESS) { \
      printf("[%s:%d] OpenCL error %d\n", __FILE__, __LINE__, (err)); \
      exit(EXIT_FAILURE); \
    } \
  } while (0)

/* Return the current wall-clock time in seconds, with microsecond
 * resolution, as a double (based on gettimeofday). */
double get_time() {
  struct timeval now;
  gettimeofday(&now, NULL);
  return (double)now.tv_sec + 1e-6 * (double)now.tv_usec;
}

/*
 * Read the entire contents of file_name into a freshly malloc'ed,
 * NUL-terminated buffer.  On success the byte count is stored in *len
 * and the buffer is returned (caller frees).  Any failure prints a
 * diagnostic and exits: this program treats a missing or unreadable
 * kernel file as fatal.
 */
char *get_source_code(const char *file_name, size_t *len) {
  char *source_code;
  long file_size;
  size_t length;
  size_t nread;
  FILE *file = fopen(file_name, "r");
  if (file == NULL) {
    printf("[%s:%d] Failed to open %s\n", __FILE__, __LINE__, file_name);
    exit(EXIT_FAILURE);
  }

  fseek(file, 0, SEEK_END);
  file_size = ftell(file);
  if (file_size < 0) {  /* ftell reports errors as -1L */
    printf("[%s:%d] Failed to get size of %s\n", __FILE__, __LINE__, file_name);
    fclose(file);
    exit(EXIT_FAILURE);
  }
  length = (size_t)file_size;
  rewind(file);

  source_code = (char *)malloc(length + 1);
  if (source_code == NULL) {  /* previously unchecked: fread into NULL */
    printf("[%s:%d] Out of memory reading %s\n", __FILE__, __LINE__, file_name);
    fclose(file);
    exit(EXIT_FAILURE);
  }

  /* Read byte-wise so a short read (e.g. CRLF translation in text mode)
   * still yields a correctly terminated buffer; the old code read one
   * length-sized item and never checked the result. */
  nread = fread(source_code, 1, length, file);
  source_code[nread] = '\0';

  fclose(file);

  *len = nread;
  return source_code;
}

int main() {
  // OpenCl variables
  cl_platform_id platform;
  //cl_device_id device;
  cl_device_id *devices;
  cl_device_id device_temp;

  cl_context context;
  //cl_command_queue queue;
  cl_command_queue *queues;

  cl_mem bufferA, bufferB, bufferC;
  cl_program program;
  char *kernel_source;
  size_t kernel_source_size;
  
  cl_kernel kernel;
  //cl_kernel *kernels;

  cl_int err;

  //
  
  
  size_t VECTOR_SIZE = 64000000 ;
  int num_devices = 4;
  size_t LOCAL_SIZE = 64;
  
  // Time variables
  double start;
  double end;

  // Get platform
  err = clGetPlatformIDs(1, &platform, NULL);
  CHECK_ERROR(err);

  // Get GPU device
  
  devices = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
  //err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
  CHECK_ERROR(err);

  // Create context
  context = clCreateContext(NULL,num_devices, devices , NULL, NULL , &err);
  //context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Get kernel code
  kernel_source = get_source_code("kernel.cl", &kernel_source_size);

  // Create program
  program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source,
    &kernel_source_size, &err);
  CHECK_ERROR(err);

  // Build program
  err = clBuildProgram(program, num_devices, devices, "", NULL, NULL);
  
  if(err == CL_BUILD_PROGRAM_FAILURE) {
    size_t log_size;
    char *log;

    // Get program build
    //err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
    //  0, NULL, &log_size);
    err = clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,0,NULL,&log_size);
    CHECK_ERROR(err);
    
    // Get build log
    log = (char*)malloc(log_size + 1);
    //err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
    //  log_size, log, NULL);
    err = clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,log_size,log,NULL);
    CHECK_ERROR(err);

    log[log_size] = '\0';
    printf("Compiler error : \n%s\n", log);
    free(log);
    exit(0);
  }
  CHECK_ERROR(err);
  // Create Vector A, B, C
  float *A = (float*)malloc(sizeof(float) * VECTOR_SIZE);
  float *B = (float*)malloc(sizeof(float) * VECTOR_SIZE);
  float *C = (float*)malloc(sizeof(float) * VECTOR_SIZE);

  // Initial Vector A, B
  //cl_ushort idx;
  /*for(idx = 0; idx < VECTOR_SIZE; idx++) {
    A[idx] = rand() % 100;
    B[idx] = rand() % 100;
  }*/
  printf("start\n");
  start = get_time();
  for(int i = 0; i <VECTOR_SIZE; i++){
      A[i] = sinf(i)*sinf(i);
      B[i] = cosf(i)*cosf(i);
  }
  end = get_time();
  printf("Initialization time : %f seconds elapsed\n", end-start);
  
  
  // Create kernel
  /*kernels = (cl_kernel *) malloc(sizeof(cl_kernel)*num_devices);
  for(int i=0; i<num_devices; i++){
      kernels[i] = clCreateKernel(program,"vec_add", &err);
      CHECK_ERROR(err);
  }*/
  kernel = clCreateKernel(program, "vec_add", &err);
  CHECK_ERROR(err);

  // Create Buffer
  bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
  CHECK_ERROR(err);

  bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
  CHECK_ERROR(err);

  bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
  CHECK_ERROR(err);
  
  printf("error hi\n");
  // Create command-queue
  queues = (cl_command_queue *) malloc(sizeof(cl_command_queue)*num_devices);
  for(int i=0; i<num_devices; i++){
      if (i==0){
          queues[i] = clCreateCommandQueue(context,devices[i],CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,&err);
          CHECK_ERROR(err);
      }
      else{
          queues[i] = clCreateCommandQueue(context,devices[i], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
          CHECK_ERROR(err);
      }
  }
  printf("error bye\n");
 
  //queue = clCreateCommandQueue(context, device, 0, &err);
  //CHECK_ERROR(err);

  // Write Buffer
  for (int i = 0; i<num_devices; i++){
      err = clEnqueueWriteBuffer(queues[i],bufferA,CL_FALSE,0,sizeof(float)*VECTOR_SIZE,A,0,NULL,NULL);
      CHECK_ERROR(err);
      err = clEnqueueWriteBuffer(queues[i],bufferB,CL_FALSE,0,sizeof(float)*VECTOR_SIZE,B,0,NULL,NULL);
      CHECK_ERROR(err);
  }
  //err = clEnqueueWriteBuffer(queue, bufferA, CL_FALSE, 0, sizeof(float) * VECTOR_SIZE, A, 0, NULL, NULL);
  //CHECK_ERROR(err);
  //err = clEnqueueWriteBuffer(queue, bufferB, CL_FALSE, 0, sizeof(float) * VECTOR_SIZE, B, 0, NULL, NULL);
  //CHECK_ERROR(err);

  for(int i=0; i<num_devices; i++){
      err=clFinish(queues[i]);
      CHECK_ERROR(err);
  }


  // Set Kernel arguments
  start = get_time();
  /*for(int i=0; i<num_devices; i++){
      err=clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &bufferA);
      CHECK_ERROR(err);
      err=clSetKernelArg(kernels[i], 1, sizeof(cl_mem), &bufferB);
      CHECK_ERROR(err);
      err=clSetKernelArg(kernels[i], 2, sizeof(cl_mem), &bufferC);
      CHECK_ERROR(err);
      err=clSetKernelArg(kernels[i], 3, sizeof(unsigned int), &VECTOR_SIZE);
      CHECK_ERROR(err);
  }*/
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferC);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &VECTOR_SIZE);
  CHECK_ERROR(err);

  end = get_time();

  printf("Send Vector A, B to GPU : %f seconds elapsed\n", end - start);

  for(int i=0; i<num_devices; i++){
      err=clFinish(queues[i]);
      CHECK_ERROR(err);
  }

  cl_event ooo_events[num_devices];
  start = get_time();
  // Execute Kernel
  size_t global_size = VECTOR_SIZE;
  size_t local_size = LOCAL_SIZE;
  for(int i=0; i<num_devices; i++){
      //start=get_time();
      
      err= clEnqueueNDRangeKernel(queues[i],kernel,1,NULL,&global_size,&local_size,0,NULL,NULL);
      CHECK_ERROR(err);
      //err = clEnqueueNDRangeKernel(queues[i],kernels[i],1,NULL,&global_size, &local_size,0,NULL,NULL);
      //CHECK_ERROR(err);
      //end=get_time();
      //printf("Calculate C : %f seconds elapsed\n", end-start);
  }
  //err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,&global_size, &local_size, 0, NULL, NULL);
  //CHECK_ERROR(err);
  for(int i=0; i<num_devices; i++){
      err=clFinish(queues[i]);
      CHECK_ERROR(err);
  }

  end = get_time();

  printf("Calculate C : %f seconds elapsed\n", end - start);

  // Read Buffer
  start = get_time();
  for(int i=0; i<num_devices; i++){
      err = clEnqueueReadBuffer(queues[i],bufferC,CL_TRUE,0,sizeof(float)*VECTOR_SIZE,C,0,NULL,NULL);
      CHECK_ERROR(err);
  }
  //err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, sizeof(float) * VECTOR_SIZE, C, 0, NULL, NULL);
  //CHECK_ERROR(err);

  end = get_time();
  printf("Receive C from GPU : %f seconds elapsed\n", end - start);

  // Evaluate Vector C
  start = get_time();
  double sum = 0;
  for(int i = 0; i < VECTOR_SIZE; i++) {
    sum += C[i];
  }
  end = get_time();
  printf("Verification time : %f seconds elapsed\n", end-start);
  printf("%lf, %ld \n", sum,VECTOR_SIZE);
  if (abs(VECTOR_SIZE - sum) < 1) {
    printf("Verification success!\n");
  }
  printf("Sum : %f\n", sum);

  // Release OpenCL object
  clReleaseMemObject(bufferA);
  clReleaseMemObject(bufferB);
  clReleaseMemObject(bufferC);
  free(A);
  free(B);
  free(C);
  clReleaseKernel(kernel);
  //clReleaseKernel(kernels[0]);
  //clReleaseKernel(kernels[1]);
  clReleaseProgram(program);
  
  clReleaseCommandQueue(queues[0]);
  clReleaseCommandQueue(queues[1]);
  //clReleaseCommandQueue(queue);
  clReleaseContext(context);

  return 0;
}

Using multiple GPUs is only beneficial in terms of performance if the amount of computational work that each GPU performs takes more time than the communication, scheduling and synchronization overhead.如果每个 GPU 执行的计算工作量比通信、调度和同步开销花费更多的时间,则使用多个 GPU 仅在性能方面有益。 This is true for a single GPU as well.对于单个 GPU 也是如此。

In your case, each GPU performs a simple vector addition.在您的情况下,每个 GPU 执行一个简单的向量加法。 But that rarely takes more time than transferring the data to the GPU, waiting for the kernel to actually get scheduled for execution, etc.但这很少需要更多的时间,然后将数据传输到 GPU,等待内核实际安排执行等。

Your code is not only measuring the kernel execution time but also the scheduling and synchronization overhead.您的代码不是测量总内核执行时间,而是测量调度开销。

I would advise you to use proper GPU profiling tools (depending on your GPU vendor) instead of manual CPU timings to properly examine what is going on.我建议您使用适当的 GPU 分析工具(取决于您的 GPU 供应商)而不是手动 CPU 计时来正确检查发生了什么。 You can also try measuring kernel execution time via events .您还可以尝试通过 events 测量内核执行时间

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM