Cuda內核未同時運行

Question

最初，我問的是由於某些原因，當我指定不同的流時，我的內核拒絕同時運行。 現在已經解決了，但是對我來說，它們的並發行為仍然不清楚。

~~我知道我的系統可以運行多個流，因為並發內核CUDA示例運行良好。~~ ~~我也可以擴展此示例，使其模仿我的代碼，並且仍然可以同時運行。~~ 提前為很多代碼道歉。 ~~我想發布所有內容，因為可能有一個小問題阻止我的內核同時運行，或者我認為這可能與擁有結構或大量單獨文件有關。~~ ~~此外，我相信這對嘗試幫助我的所有人都有用！~~ 我剛剛編寫了以下簡化程序來復制我的問題：

testMain.c

#include <stdlib.h>
#include <signal.h>
#include "test.h"

/* Number of int elements in each device vector (the N passed to Get_Vector). */
#define Nsim 900000
/* Number of times main() calls computeGPU(). */
#define Ncomp 20

/* Device-backed vectors handed to computeGPU(); allocated and freed in main(). */
Vector* test1;
Vector* test2;
Vector* test3;

/* Streams the kernels are launched on; declared extern in test.h so test.cu
 * can use them, created and destroyed in main(). */
cudaStream_t stream1;
cudaStream_t stream2;
cudaStream_t stream3;

/*
 * Entry point of the test program.
 *
 * Allocates three device vectors of Nsim ints, creates three streams, calls
 * computeGPU() Ncomp times (waiting for the device after every call), then
 * destroys the streams, frees the vectors and resets the device.
 *
 * Every CUDA call goes through checkGPU(), which terminates the process on
 * the first error. The command-line arguments are not used.
 *
 * cudaDeviceSynchronize() is used instead of the deprecated
 * cudaThreadSynchronize(); it has the same blocking semantics and matches
 * the cudaDevice* naming already used by the cudaDeviceReset() call below.
 */
int
main (int argc, char **argv)
{
    int x;

    (void) argc;
    (void) argv;

    test1 = Get_Vector(Nsim);
    test2 = Get_Vector(Nsim);
    test3 = Get_Vector(Nsim);

    checkGPU( cudaStreamCreate(&stream1) );
    checkGPU( cudaStreamCreate(&stream2) );
    checkGPU( cudaStreamCreate(&stream3) );

    for (x = 0; x < Ncomp; x++)
    {
      computeGPU(test1, test2, test3, x);
      /* Wait for the work queued by this iteration on all streams before
       * queuing the next batch, so batches never overlap each other. */
      checkGPU( cudaDeviceSynchronize() );
    }
    /* Final barrier: guarantees the device is idle before teardown even if
     * the per-iteration synchronisation above is removed. */
    checkGPU( cudaDeviceSynchronize() );

    checkGPU( cudaStreamDestroy(stream1) );
    checkGPU( cudaStreamDestroy(stream2) );
    checkGPU( cudaStreamDestroy(stream3) );

    Free_Vector(test1);
    Free_Vector(test2);
    Free_Vector(test3);

    checkGPU( cudaDeviceReset() );
    exit(EXIT_SUCCESS);
}

basics.c

#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include "basics.h"

/*
 * Aborts the program when a CUDA runtime call has failed.
 *
 * code: status returned by the CUDA call being checked.
 * file: source file of the call site (supplied by checkGPU via __FILE__).
 * line: source line of the call site (supplied by checkGPU via __LINE__).
 *
 * On cudaSuccess the function returns normally. Otherwise it prints the
 * CUDA error string together with the call site to stderr and exits with
 * EXIT_FAILURE; it never returns in that case.
 *
 * The function is deliberately NOT declared inline: it is defined in this
 * one source file and called from other translation units through the
 * prototype in basics.h, so it needs an ordinary external definition.
 */
void gpuAssert(cudaError_t code, const char *file, int line)
{
  if (code == cudaSuccess)
    return;

  fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
  exit(EXIT_FAILURE);
}

basics.h

/* Include guard. The name avoids the leading underscore + capital form,
 * which is reserved for the implementation. */
#ifndef BASICS_H
#define BASICS_H

#include <cuda_runtime.h>

/*
 * Checks the cudaError_t produced by `ans` and terminates the program via
 * gpuAssert() if it is not cudaSuccess, reporting the file and line of the
 * call site.
 *
 * Wrapped in do { ... } while (0) so that `checkGPU(call);` is a single
 * statement and stays correct inside an unbraced if/else.
 */
#define checkGPU(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/* Prints the CUDA error string plus file/line and exits when code is not
 * cudaSuccess; returns normally otherwise. Defined in basics.c. */
void gpuAssert(cudaError_t code, const char *file, int line);

#endif // BASICS_H

test.cu

extern "C"
{
#include "test.h"
}

/*
 * Stores (int)(x * 1.05 / 0.4), evaluated in double precision, into each of
 * the first n elements of `in`.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element, no shared
 * memory. The grid may be rounded up past n; threads whose index is >= n do
 * nothing, so the buffer only has to hold n ints.
 */
__global__ void compute(int* in, int x, int n)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    in[i] = (int) (x * 1.05 / 0.4);
}

/*
 * Queues one compute() launch covering every element of `v` on `stream`.
 *
 * The grid is sized from v's own length, and the launch is checked with
 * cudaGetLastError(), which reports launch-time errors such as an invalid
 * configuration. An empty vector is skipped because a zero-block grid is
 * not a valid launch configuration.
 */
static void launchCompute(Vector* v, int x, cudaStream_t stream)
{
  const int threadsPerBlock = 256;
  int blocksPerGrid;

  if (v->N <= 0)
    return;

  /* Ceiling division: enough blocks to cover all N elements. */
  blocksPerGrid = (v->N + threadsPerBlock - 1) / threadsPerBlock;
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(v->d_data, x, v->N);
  checkGPU( cudaGetLastError() );
}

/*
 * Launches compute() once per vector, each on its own stream
 * (in1 -> stream1, in2 -> stream2, in3 -> stream3), passing x through to
 * the kernel. Only queues the work; the caller is responsible for
 * synchronising before using the results.
 */
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
  launchCompute(in1, x, stream1);
  launchCompute(in2, x, stream2);
  launchCompute(in3, x, stream3);
}

test.h

/* Include guard. The name avoids the leading underscore + capital form,
 * which is reserved for the implementation. */
#ifndef TEST_H
#define TEST_H

#include "vector.h"
#include "basics.h"
#include <cuda_runtime.h>

/* Streams used for the kernel launches. Defined in testMain.c, which also
 * creates and destroys them. */
extern cudaStream_t stream1;
extern cudaStream_t stream2;
extern cudaStream_t stream3;

/* Queues the compute kernels for the three vectors on the streams above,
 * passing x through to the kernel. Defined in test.cu with C linkage. */
extern void computeGPU(Vector* in1, Vector* in2, Vector* in3, int x);

#endif // TEST_H

vector.c

#include <stdio.h>
#include <stdlib.h>
#include "vector.h"
#include "basics.h"

/*
 * Creates a Vector whose data buffer lives in device memory.
 *
 * N: number of int elements to allocate on the device.
 *
 * Returns a heap-allocated Vector with N recorded and d_data pointing at a
 * cudaMalloc'ed buffer of N ints. Release it with Free_Vector().
 *
 * Never returns NULL: the process exits with EXIT_FAILURE if the host
 * allocation fails, and checkGPU() exits if the device allocation fails.
 */
Vector*
Get_Vector(int N)
{
  Vector* v = (Vector*) calloc(1, sizeof(Vector));
  if (v == NULL)
    {
      /* Fail loudly here instead of dereferencing NULL below. */
      fprintf(stderr, "Get_Vector: host allocation failed\n");
      exit(EXIT_FAILURE);
    }
  v->N = N;
  checkGPU( cudaMalloc((void**) &v->d_data, N * sizeof(int)) );
  return v;
}

/*
 * Releases a Vector created by Get_Vector(): frees the device buffer with
 * cudaFree() and then the host-side struct.
 *
 * in: vector to release; NULL is accepted and ignored, mirroring free().
 *
 * checkGPU() terminates the process if cudaFree() reports an error.
 */
void
Free_Vector(Vector* in)
{
  if (in == NULL)
    return;

  checkGPU( cudaFree(in->d_data) );
  free(in);
}

vector.h

/* Include guard. The name avoids the leading underscore + capital form,
 * which is reserved for the implementation. */
#ifndef VECTOR_H
#define VECTOR_H

/* A fixed-length array of ints stored in device memory. */
typedef struct
{
    int N;        /* number of elements */
    int* d_data;  /* device pointer obtained from cudaMalloc in Get_Vector() */
} Vector;

/* Allocates a Vector with room for N ints on the device. */
extern Vector* Get_Vector(int N);

/* Frees the device buffer and the Vector struct itself. */
extern void Free_Vector(Vector* in);

#endif // VECTOR_H

我編譯：

nvcc -gencode arch=compute_20,code=sm_20 -O3 -use_fast_math -lineinfo -o test testMain.c test.cu basics.c vector.c; time ./test

並獲得在nvvp中運行的單獨內核：

內核以串行方式運行，而不是並行運行。

在羅伯茨的幫助下，我通過減少Nsim解決了這個問題。

如果Nsim很大（如我問題中的900000），單個內核的線程塊就足以佔滿整個GPU，因此即使把內核分配到不同的流中，GPU也無法同時運行它們。 性能分析結果如上所示。
如果Nsim很小（900），則理論上內核可以並發運行，但是我的內核太簡單，執行時間比啟動下一個內核的開銷還短，因此在分析器的Runtime API行中，整個仿真幾乎只剩下一連串的Launch compute(int*, int, int)調用。 性能分析結果如下所示。
如果我對內核和代碼進行了更改，以使內核需要更長的運行時間（並將Nsim設置為合理的值3000，現在就不重要了）：

test.cu

/*
 * Writes (int)(x * 1.05 / 0.4) into in[global thread index], then spins
 * until at least 5000000 * y clock() ticks have elapsed on the executing
 * thread, so that a larger y makes the kernel run proportionally longer.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element; no shared
 * memory is used.
 *
 * NOTE(review): the store is not bounds-checked. Every launched thread
 * writes, so the buffer must hold gridDim.x * blockDim.x ints; callers that
 * round the grid up past the element count write beyond the allocation.
 */
__global__ void compute(int* in, int x, int y)
{
  const int idx = threadIdx.x + blockDim.x*blockIdx.x;
  in[idx] = (int) (x * + 1.05 / 0.4);

  /* Busy-wait: re-sample the clock until the requested tick budget is used. */
  const int spinTicks = 5000000 * y;
  const clock_t began = clock();
  for (clock_t elapsed = 0; elapsed < spinTicks; elapsed = clock() - began)
  {
  }
}

/*
 * Queues one compute() launch per vector, each on its own stream, with
 * busy-wait multipliers 1, 2 and 3 (in1/stream1, in2/stream2, in3/stream3).
 *
 * The grid size is derived from in1->N and reused for all three launches.
 * Each launch is followed by cudaGetLastError() so that a rejected launch
 * (for example an invalid configuration) is reported at the call site
 * rather than being silently dropped. The function only queues work; the
 * caller synchronises.
 */
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
  const int threadsPerBlock = 256;
  /* Ceiling division: enough blocks to cover all in1->N elements. */
  const int blocksPerGrid = (in1->N + threadsPerBlock - 1) / threadsPerBlock;

  compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
  checkGPU( cudaGetLastError() );
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream2>>>(in2->d_data, x, 2);
  checkGPU( cudaGetLastError() );
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream3>>>(in3->d_data, x, 3);
  checkGPU( cudaGetLastError() );
}

現在，我的內核同時運行，等待這三個內核完成，然后再啟動下三個內核，因為我在循環中進行了同步： 並發運行的內核

但是，如果按照下面的修改來啟動內核，我的預期是：由於我在循環中啟動了所有內核，之後才進行一次同步，所以各個流中的內核應該背靠背地連續運行——最快的那個流在整個運行時間的1/3處完成，第二個在2/3處完成，最後一個在結尾處完成。 但實際情況並非如此，這是怎麼回事？ 難道CUDA神奇地意識到反正必須等待較長的內核完成，於是認為把其他內核穿插在其間運行更優？ 所有內核都已啟動，運行時只是在等待那一次同步（可以在Runtime API行中看到）。

testMain.c

int x = 0;
for (x = 0; x < Ncomp; x++)
{
  computeGPU(test1, test2, test3, x);
  //checkGPU( cudaThreadSynchronize() );
}
checkGPU( cudaThreadSynchronize() );

並發運行但未按預期運行的內核

此外，按以下方式啟動內核得到的結果非常令人困惑，與預期不同。 其中兩個流的總運行時間相同（1x3和3x1），它們理應能配合得比這更好，而剩下的那個流只需在這段時間內找個空檔運行即可。

test.cu

/*
 * Queues five compute() launches: three back-to-back on stream1 with
 * multiplier 1 (all on in1), one on stream2 with multiplier 2 (in2) and
 * one on stream3 with multiplier 3 (in3).
 *
 * The grid size is derived from in1->N and reused for every launch. Each
 * launch is followed by cudaGetLastError() so that a rejected launch is
 * reported at the call site rather than being silently dropped. The
 * function only queues work; the caller synchronises.
 */
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
  const int threadsPerBlock = 256;
  /* Ceiling division: enough blocks to cover all in1->N elements. */
  const int blocksPerGrid = (in1->N + threadsPerBlock - 1) / threadsPerBlock;

  compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
  checkGPU( cudaGetLastError() );
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
  checkGPU( cudaGetLastError() );
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
  checkGPU( cudaGetLastError() );
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream2>>>(in2->d_data, x, 2);
  checkGPU( cudaGetLastError() );
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream3>>>(in3->d_data, x, 3);
  checkGPU( cudaGetLastError() );
}

令人困惑的結果

Answer 1

http://on-demand.gputechconf.com/gtc-express/2011/presentations/StreamsAndConcurrencyWebinar.pdf

請參見幻燈片18，以描述提交並發內核的有效順序。

帶有音頻：https://developer.nvidia.com/gpu-computing-webinars

尋找CUDA並發和流。

Cuda內核未同時運行

問題描述

1 個解決方案

解決方案1
0 已采納 2015-02-17 13:42:40

Cuda內核未同時運行

問題描述

1 個解決方案

解決方案1 0 已采納 2015-02-17 13:42:40

解決方案1
0 已采納 2015-02-17 13:42:40