Originally I was asking that for some reason my kernels refused to run concurrently when I specify different streams. This has now be solved, however their concurrent behaviour is still not clear to me.
I know my system can run multiple streams, as the concurrentKernels CUDA sample runs fine.
I can also extend this example so it mimicks my code and it still runs concurrently.
Apologies in advance for lots of code.
I wanted to post it all as there is probably one small thing blocks my kernels running concurrently or I'm thinking it might be something to do with having structures or lots of separate files.
Furthermore I am sure it is useful to you all when attempting to help me!
I just wrote the following simplified programme which replicates my problem:
testMain.c
#include <stdlib.h>
#include <signal.h>
#include "test.h"
#define Nsim 900000
#define Ncomp 20
Vector* test1;
Vector* test2;
Vector* test3;
cudaStream_t stream1;
cudaStream_t stream2;
cudaStream_t stream3;
int
main (int argc, char **argv)
{
test1 = Get_Vector(Nsim);
test2 = Get_Vector(Nsim);
test3 = Get_Vector(Nsim);
checkGPU( cudaStreamCreate(&stream1) );
checkGPU( cudaStreamCreate(&stream2) );
checkGPU( cudaStreamCreate(&stream3) );
int x = 0;
for (x = 0; x < Ncomp; x++)
{
computeGPU(test1, test2, test3, x);
checkGPU( cudaThreadSynchronize() );
}
checkGPU( cudaThreadSynchronize() );
checkGPU( cudaStreamDestroy(stream1) );
checkGPU( cudaStreamDestroy(stream2) );
checkGPU( cudaStreamDestroy(stream3) );
Free_Vector(test1);
Free_Vector(test2);
Free_Vector(test3);
checkGPU( cudaDeviceReset() );
exit(EXIT_SUCCESS);
}
basics.c
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include "basics.h"
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
exit(EXIT_FAILURE);
}
}
basics.h
#ifndef _BASICS_H
#define _BASICS_H
#include <cuda_runtime.h>
#define checkGPU(ans) { gpuAssert((ans), __FILE__, __LINE__); }
void gpuAssert(cudaError_t code, const char *file, int line);
#endif // _BASICS_H
test.cu
extern "C"
{
#include "test.h"
}
__global__ void compute(int* in, int x)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
in[i] = (int) (x * + 1.05 / 0.4);
}
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
int threadsPerBlock = 256;
int blocksPerGrid = (in1->N + threadsPerBlock - 1) / threadsPerBlock;
compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream2>>>(in2->d_data, x);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream3>>>(in3->d_data, x);
}
test.h
#ifndef _TEST_H
#define _TEST_H
#include "vector.h"
#include "basics.h"
#include <cuda_runtime.h>
extern cudaStream_t stream1;
extern cudaStream_t stream2;
extern cudaStream_t stream3;
extern void computeGPU(Vector* in1, Vector* in2, Vector* in3, int x);
#endif // _TEST_H
vector.c
#include <stdlib.h>
#include "vector.h"
#include "basics.h"
Vector*
Get_Vector(int N)
{
Vector* v = (Vector*) calloc(1, sizeof(Vector));
v->N = N;
checkGPU( cudaMalloc((void**) &v->d_data, N * sizeof(int)) );
return v;
}
void
Free_Vector(Vector* in)
{
checkGPU( cudaFree(in->d_data) );
free(in);
}
vector.h
#ifndef _VECTOR_H
#define _VECTOR_H
typedef struct
{
int N;
int* d_data;
} Vector;
extern Vector* Get_Vector(int N);
extern void Free_Vector(Vector* in);
#endif // _VECTOR_H
I compile with:
nvcc -gencode arch=compute_20,code=sm_20 -O3 -use_fast_math -lineinfo -o test testMain.c test.cu basics.c vector.c; time ./test
And get separate kernels running in nvvp:
With Roberts's help I solved this problem by reducing Nsim.
If Nsim is small (900), the kernels can in theory run concurrently however my kernel is so simple they finish quicker than the overhead of launching the next kernel, therefore the whole simulation is just Launch Compute(int*,int,int) in the RuntimeAPI row. The profile results look like this
If I make changes to my kernel and code such that the kernel takes longer to run (and set Nsim to something reasonable, 3000, not important now):
test.cu
__global__ void compute(int* in, int x, int y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
in[i] = (int) (x * + 1.05 / 0.4);
int clock_count = 5000000 * y;
clock_t start_clock = clock();
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
clock_offset = clock() - start_clock;
}
}
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
int threadsPerBlock = 256;
int blocksPerGrid = (in1->N + threadsPerBlock - 1) / threadsPerBlock;
compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream2>>>(in2->d_data, x, 2);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream3>>>(in3->d_data, x, 3);
}
My kernels now run concurrently waiting for the three to finish before launching the next three because I synchronise within my loop:
testMain.c
int x = 0;
for (x = 0; x < Ncomp; x++)
{
computeGPU(test1, test2, test3, x);
//checkGPU( cudaThreadSynchronize() );
}
checkGPU( cudaThreadSynchronize() );
test.cu
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
int threadsPerBlock = 256;
int blocksPerGrid = (in1->N + threadsPerBlock - 1) / threadsPerBlock;
compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream2>>>(in2->d_data, x, 2);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream3>>>(in3->d_data, x, 3);
}
http://on-demand.gputechconf.com/gtc-express/2011/presentations/StreamsAndConcurrencyWebinar.pdf
look at slide 18 for a description on an efficient order for submitting concurrent kernels.
With audio: https://developer.nvidia.com/gpu-computing-webinars
look for cuda concurrency & streams.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.