简体   繁体   中英

Cuda kernels not running concurrent

Originally I was asking why, for some reason, my kernels refused to run concurrently when I specified different streams. That has now been solved; however, their concurrent behaviour is still not clear to me.

I know my system can run multiple streams, as the concurrentKernels CUDA sample runs fine. I can also extend this example so it mimics my code, and it still runs concurrently. Apologies in advance for the large amount of code. I wanted to post it all because there is probably one small thing blocking my kernels from running concurrently, or I'm thinking it might be something to do with having structures or lots of separate files. Furthermore, I am sure it is useful to you all when attempting to help me! I just wrote the following simplified programme which replicates my problem:

testMain.c

#include <stdlib.h>
#include <signal.h>
#include "test.h"

#define Nsim 900000
#define Ncomp 20

/* Device vectors operated on by computeGPU() (see test.h). */
Vector* test1;
Vector* test2;
Vector* test3;

/* One stream per vector so the three kernel launches inside
 * computeGPU() can potentially overlap; referenced from test.cu
 * through the externs in test.h. */
cudaStream_t stream1;
cudaStream_t stream2;
cudaStream_t stream3;

/*
 * Allocates three device vectors and three streams, runs Ncomp rounds
 * of computeGPU() (three kernel launches per round, one per stream),
 * then tears everything down. Exits with EXIT_SUCCESS.
 */
int
main (int argc, char **argv)
{
    test1 = Get_Vector(Nsim);
    test2 = Get_Vector(Nsim);
    test3 = Get_Vector(Nsim);

    checkGPU( cudaStreamCreate(&stream1) );
    checkGPU( cudaStreamCreate(&stream2) );
    checkGPU( cudaStreamCreate(&stream3) );

    int x = 0;
    for (x = 0; x < Ncomp; x++)
    {
      computeGPU(test1, test2, test3, x);
      /* cudaThreadSynchronize() is deprecated (and removed in CUDA 10+);
       * cudaDeviceSynchronize() is the supported equivalent. Note this
       * barrier forces each round's three kernels to complete before
       * the next round is launched. */
      checkGPU( cudaDeviceSynchronize() );
    }
    checkGPU( cudaDeviceSynchronize() );

    checkGPU( cudaStreamDestroy(stream1) );
    checkGPU( cudaStreamDestroy(stream2) );
    checkGPU( cudaStreamDestroy(stream3) );

    Free_Vector(test1);
    Free_Vector(test2);
    Free_Vector(test3);

    checkGPU( cudaDeviceReset() );
    exit(EXIT_SUCCESS);
}

basics.c

#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include "basics.h"

/*
 * Reports a CUDA runtime error with its source location and aborts.
 * Invoked through the checkGPU() macro (basics.h), which supplies
 * __FILE__/__LINE__ of the failing call site.
 *
 * Note: the definition was previously marked `inline`. In C99 an
 * `inline` definition in a .c file only provides an external definition
 * because basics.h re-declares the function without `inline`; dropping
 * the qualifier makes the linkage explicit and unambiguous.
 */
void gpuAssert(cudaError_t code, const char *file, int line)
{
  if (code != cudaSuccess) 
    {
      fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
      exit(EXIT_FAILURE);
    }
}

basics.h

#ifndef _BASICS_H
#define _BASICS_H

#include <cuda_runtime.h>

/* Wraps a CUDA runtime call, exiting on failure with the call site's
 * file and line. do { } while (0) makes the expansion a single
 * statement, so the macro is safe inside an unbraced if/else (the
 * previous bare { } block form would break there). */
#define checkGPU(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/* Prints the CUDA error string with its source location and exits;
 * defined in basics.c. */
void gpuAssert(cudaError_t code, const char *file, int line);

#endif // _BASICS_H

test.cu

extern "C"
{
#include "test.h"
}

/*
 * Writes the constant x * (1.05 / 0.4), truncated to int, into every
 * element this thread covers. Launched from computeGPU() with a grid
 * rounded up to cover Vector::N elements.
 *
 * NOTE(review): there is no `if (i < N)` bounds guard. Because the
 * launcher rounds the grid up (ceil-div), the tail threads of the last
 * block write past the end of the cudaMalloc'd buffer whenever N is
 * not a multiple of blockDim.x — an out-of-bounds device write.
 * Fixing this requires passing the element count into the kernel and
 * guarding, which touches both this kernel and its launcher.
 */
__global__ void compute(int* in, int x)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  in[i] = (int) (x * + 1.05 / 0.4);  /* `* +` is a unary plus: x * 2.625 */
}

/*
 * Launches the compute kernel once per vector, each launch in its own
 * stream (stream1..3 from testMain.c) so the three kernels may overlap.
 */
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
  int threadsPerBlock = 256;
  /* Grid is sized from in1->N (ceil-div) and reused for all three
   * launches; assumes all three vectors have the same length, which
   * testMain.c guarantees (all created with Nsim elements). */
  int blocksPerGrid = (in1->N + threadsPerBlock - 1) / threadsPerBlock;
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x);
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream2>>>(in2->d_data, x);
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream3>>>(in3->d_data, x);
  /* Kernel launches return no status themselves; surface any
   * launch-configuration error here rather than at the next sync. */
  checkGPU( cudaGetLastError() );
}

test.h

#ifndef _TEST_H
#define _TEST_H

#include "vector.h"
#include "basics.h"
#include <cuda_runtime.h>

/* Streams are defined in testMain.c and used by the kernel launches
 * in test.cu — one stream per vector. */
extern cudaStream_t stream1;
extern cudaStream_t stream2;
extern cudaStream_t stream3;

/* Launches the compute kernel once per vector, each in its own
 * stream; defined in test.cu. */
extern void computeGPU(Vector* in1, Vector* in2, Vector* in3, int x);

#endif // _TEST_H

vector.c

#include <stdio.h>
#include <stdlib.h>

#include "vector.h"
#include "basics.h"

/*
 * Allocates a Vector whose data buffer holds N ints in device memory
 * (the struct itself lives on the host heap). Exits via checkGPU() on
 * CUDA failure and aborts with a message if the host allocation fails.
 * The caller releases the result with Free_Vector().
 */
Vector*
Get_Vector(int N)
{
  Vector* v = (Vector*) calloc(1, sizeof(Vector));
  if (v == NULL)
    {
      /* calloc can fail; previously this would dereference NULL below. */
      fprintf(stderr, "Get_Vector: host allocation failed\n");
      exit(EXIT_FAILURE);
    }
  v->N = N;
  checkGPU( cudaMalloc((void**) &v->d_data, N * sizeof(int)) );
  return v;
}

/*
 * Releases the device buffer and the host struct created by
 * Get_Vector(). A NULL argument is a no-op, mirroring free()'s
 * contract, so callers need no guard of their own.
 */
void
Free_Vector(Vector* in)
{
  if (in == NULL)
    return;
  checkGPU( cudaFree(in->d_data) );
  free(in);
}

vector.h

#ifndef _VECTOR_H
#define _VECTOR_H

/* A fixed-length int array residing in GPU global memory. d_data is a
 * DEVICE pointer (allocated with cudaMalloc in vector.c) and must not
 * be dereferenced on the host. */
typedef struct
{
    int N;        /* number of elements in d_data */
    int* d_data;  /* device allocation of N ints */
} Vector;

/* Allocates a Vector with room for N ints on the device. */
extern Vector* Get_Vector(int N);

/* Frees the device buffer, then the host struct. */
extern void Free_Vector(Vector* in);

#endif // _VECTOR_H

I compile with:

nvcc -gencode arch=compute_20,code=sm_20 -O3 -use_fast_math -lineinfo -o test testMain.c test.cu basics.c vector.c; time ./test

And get separate kernels running in nvvp:

[Profiler screenshot: the kernels run serially rather than concurrently.]

With Robert's help I solved this problem by reducing Nsim.

  1. If Nsim is big (900000) as in my question, the GPU is full of blocks and therefore cannot run my kernels concurrently even if specified in separate streams. The profile results are as above.
  2. If Nsim is small (900), the kernels can in theory run concurrently; however, my kernel is so simple that each one finishes more quickly than the overhead of launching the next, so the whole simulation is just "Launch Compute(int*,int,int)" in the RuntimeAPI row. [Profiler screenshot: results for small Nsim (900).]

  3. If I make changes to my kernel and code such that the kernel takes longer to run (and set Nsim to something reasonable, 3000, not important now):

test.cu

/*
 * Same computation as the simple kernel, followed by a busy-wait of
 * roughly 5,000,000 * y device clock ticks so each kernel runs long
 * enough for concurrency to be visible in the profiler (the same
 * technique as the CUDA concurrentKernels sample).
 *
 * NOTE(review): like the simple kernel, this has no `if (i < N)`
 * guard, so tail threads write out of bounds when N is not a multiple
 * of blockDim.x.
 */
__global__ void compute(int* in, int x, int y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  in[i] = (int) (x * + 1.05 / 0.4);

  /* clock() yields a 32-bit counter that wraps quickly at GPU clock
   * rates; clock64() is monotonic for the kernel's lifetime, so the
   * spin can neither exit early nor hang on counter wraparound. */
  long long int clock_count = 5000000LL * y;
  long long int start_clock = clock64();
  long long int clock_offset = 0;
  while (clock_offset < clock_count)
  {
    clock_offset = clock64() - start_clock;
  }
}

/*
 * Launches the delay kernel once per stream so the three launches can
 * overlap; the per-stream y arguments (1, 2, 3) give each kernel a
 * different busy-wait duration.
 */
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
  const int block = 256;
  const int grid = (in1->N + block - 1) / block;  /* ceil-div, sized from in1 */
  compute<<<grid, block, 0, stream1>>>(in1->d_data, x, 1);
  compute<<<grid, block, 0, stream2>>>(in2->d_data, x, 2);
  compute<<<grid, block, 0, stream3>>>(in3->d_data, x, 3);
}

My kernels now run concurrently, waiting for all three to finish before launching the next three, because I synchronise within my loop: [Profiler screenshot: kernels running concurrently.]

  1. However, if I launch my kernels with the following changes, I would expect that — because I launch all my kernels in the loop and only then synchronise — the kernels should all run back to back, with the quickest finishing 1/3 of the way through the run, the second at 2/3, and the last at the end. What is happening here? Is CUDA doing some magic to realise that it has to wait for the long kernels to finish anyway, and so interleaving the others more cleverly? All the kernels are launched up front and the runtime is just waiting on the single synchronise (this can be seen in the RuntimeAPI row).

testMain.c

int x = 0;
/* All 3*Ncomp kernels are now queued up front; only the single
 * synchronise after the loop blocks the host, so kernels from
 * different iterations can overlap on the device. */
for (x = 0; x < Ncomp; x++)
{
  computeGPU(test1, test2, test3, x);
  //checkGPU( cudaThreadSynchronize() );
}
checkGPU( cudaThreadSynchronize() );

[Profiler screenshot: kernels running concurrently but not as expected.]

  1. Furthermore, launching kernels with the following is very confusing, not as expected. Surely they can synchronise up better than this with two kernels taking the same amount of time to run (1x3 and 3x1) and the other just fitting in the time to run these somewhere.

test.cu

/*
 * Variant used to illustrate the confusing profile: three back-to-back
 * launches in stream1 (each with y=1) followed by one launch in
 * stream2 (y=2) and one in stream3 (y=3).
 *
 * NOTE(review): this is a depth-first submission order — all of
 * stream1's work is queued before any work for stream2/stream3. The
 * streams-and-concurrency webinar referenced below (slide 18) explains
 * that on hardware with a single kernel-dispatch queue this ordering
 * can serialise launches that would otherwise overlap; breadth-first
 * submission (round-robin across streams) exposes more concurrency.
 */
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
  int threadsPerBlock = 256;
  int blocksPerGrid = (in1->N + threadsPerBlock - 1) / threadsPerBlock;
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream2>>>(in2->d_data, x, 2);
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream3>>>(in3->d_data, x, 3);
}

[Profiler screenshot: confusing results.]

http://on-demand.gputechconf.com/gtc-express/2011/presentations/StreamsAndConcurrencyWebinar.pdf

look at slide 18 for a description on an efficient order for submitting concurrent kernels.

With audio: https://developer.nvidia.com/gpu-computing-webinars

look for cuda concurrency & streams.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM