Overlap kernel execution on multiple streams
We have a fairly monolithic kernel (see below) that we launch with a grid of 1 and a block of 1:

kernel<<<1,1>>>

It then dynamically launches many smaller kernels. In general, data flows from one kernel to the next: the input enters at the first kernel and flows through to the last.

However, we have identified the potential to overlap two streams of data, each running the same kernels.

Question: do we have to abandon dynamic kernel launching in favor of a host-based approach in order to overlap the execution of the two large kernels? Or is the scheduler in the card smart enough to interleave execution between the two large kernels, treating each as a separate scheduling item?

We are talking about a Tesla K80, with a Linux host.

(Yes, we will get some overlap from execution that overlaps with cudaMemcpyAsync(), but we would also like to see some overlapped kernel execution.)
#include <cuda.h>
#include <cuda_runtime.h>
#include "coss_types.h"
#include "image.h"
#include "centroid.h"
#include "gpu.h"
#define GPU_TILE_WIDTH 16
#define GPU_TILE_HEIGHT 16
#define GPU_TILE_WBIG 32
#define GPU_TILE_HBIG 32
#define K_IMG_MAX 1024
__constant__ unsigned short* pFrameStack[GPU_CHX];
__constant__ unsigned short* pBackground[GPU_CHX];
__constant__ short* pCleanground[GPU_CHX];
__constant__ unsigned char* pMask[GPU_CHX];
__constant__ float* pForeground[GPU_CHX];
__constant__ float* pLowground[GPU_CHX];
__constant__ float* pLowgroundRow[GPU_CHX];
__constant__ float* pHighground[GPU_CHX];
__constant__ float* pHighgroundRow[GPU_CHX];
__constant__ float* pMins[GPU_CHX];
__constant__ float* pMaxs[GPU_CHX];
__constant__ int gSlot;
__constant__ int* pPercentile[GPU_CHX];
__constant__ int* pLabels1[GPU_CHX];
__constant__ int* pLabels2[GPU_CHX];
__constant__ int* pRawLabels[GPU_CHX];
__constant__ int* pLabels[GPU_CHX];
__constant__ ImgInfoBlock_t* pInfo[GPU_CHX];
__constant__ unsigned short* pSums[GPU_CHX];
__constant__ unsigned short* pBlockSums[GPU_CHX];
__constant__ ImgCentroid_t* pCenters[GPU_CHX];
__constant__ float threshold_sigma = 9.0f;
/* INCLUDED GENERATED CUDA CODE BELOW */
#include "cuda.cu"
/* INCLUDED GENERATED CUDA CODE ABOVE */
extern "C" __device__ void Background(int ch)
{
dim3 block;
dim3 grid;
/* Background Estimation */
block = dim3(128);
grid = dim3((IMG_PIXELS)/256); /* Only half screens at a time */
gMedian<<<grid,block>>>(
pFrameStack[ch],GPU_STACKSIZE,IMG_PIXELS,
pBackground[ch],IMG_HEIGHT,IMG_WIDTH,gSlot);
cudaDeviceSynchronize();
/* Background Removal */
block = dim3(128);
grid = dim3((IMG_PIXELS)/128);
gScrub<<<grid,block>>>(
pFrameStack[ch],GPU_STACKSIZE,IMG_PIXELS,
pBackground[ch],IMG_HEIGHT,IMG_WIDTH,
pCleanground[ch],IMG_HEIGHT,IMG_WIDTH,gSlot);
cudaDeviceSynchronize();
}
extern "C" __device__ void Convolution(int ch)
{
dim3 block;
dim3 grid;
dim3 block_b;
dim3 grid_b;
/* Convolve Rows */
block = dim3(GPU_TILE_WIDTH,GPU_TILE_HEIGHT);
grid = dim3(IMG_WIDTH/GPU_TILE_WIDTH,IMG_HEIGHT/GPU_TILE_HEIGHT);
gConvolveRow<<<grid,block>>>(
pCleanground[ch], IMG_HEIGHT,IMG_WIDTH,
pLowgroundRow[ch], IMG_HEIGHT,IMG_WIDTH);
block_b = dim3(GPU_TILE_WBIG,GPU_TILE_HBIG);
grid_b = dim3(IMG_WIDTH/GPU_TILE_WBIG,IMG_HEIGHT/GPU_TILE_HBIG);
gConvolveBigRow<<<grid_b,block_b>>>(
pCleanground[ch], IMG_HEIGHT,IMG_WIDTH,
pHighgroundRow[ch], IMG_HEIGHT,IMG_WIDTH);
/* Convolve Cols */
cudaDeviceSynchronize();
gConvolveCol<<<grid,block>>>(
pLowgroundRow[ch], IMG_HEIGHT,IMG_WIDTH,
pLowground[ch], IMG_HEIGHT,IMG_WIDTH);
gConvolveBigCol<<<grid_b,block_b>>>(
pHighgroundRow[ch], IMG_HEIGHT,IMG_WIDTH,
pHighground[ch], IMG_HEIGHT,IMG_WIDTH);
/* Band pass */
cudaDeviceSynchronize();
block = dim3(256,4);
grid = dim3(IMG_WIDTH / 256, IMG_HEIGHT / 4);
gBpass<<<grid,block>>>(
pLowground[ch], IMG_HEIGHT,IMG_WIDTH,
pHighground[ch], IMG_HEIGHT,IMG_WIDTH,
pForeground[ch], IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Threshold(int ch)
{
dim3 block;
dim3 grid;
/* Set the calibration sigma in Info Bloc */
pInfo[ch]->sigma = threshold_sigma;
/* Min Max kernels */
block = dim3(512, 2);
grid = dim3(IMG_WIDTH / 512, IMG_HEIGHT / 2);
gMinMax<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pMins[ch], 5 * K_IMG_MAX,
pMaxs[ch], 5 * K_IMG_MAX);
cudaDeviceSynchronize();
block = dim3(K_IMG_MAX);
grid = dim3(1);
gMinMaxMinMax<<<grid,K_IMG_MAX>>>(
pMins[ch], 5 * K_IMG_MAX,
pMaxs[ch], 5 * K_IMG_MAX,
(struct PipeInfoBlock*)pInfo[ch],1);
/* Histogram */
cudaDeviceSynchronize();
block = dim3(GPU_TILE_WBIG,GPU_TILE_HBIG);
grid = dim3(IMG_WIDTH/GPU_TILE_WBIG,IMG_HEIGHT/GPU_TILE_HBIG);
gHistogram<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pPercentile[ch],K_IMG_MAX,
(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
block = dim3(K_IMG_MAX);
grid = dim3(1);
gSumHistogram<<<grid,block>>>(pPercentile[ch],K_IMG_MAX);
cudaDeviceSynchronize();
gIQR<<<grid,block>>>(pPercentile[ch],K_IMG_MAX,(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
block = dim3(256,4);
grid = dim3(IMG_WIDTH / 256, IMG_HEIGHT / 4);
gThreshold<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
}
extern "C" __device__ void Gluing(int ch)
{
dim3 block;
dim3 grid;
block = dim3(24, 24);
grid = dim3(IMG_WIDTH / 16, IMG_HEIGHT / 16);
gGlue<<<grid, block>>>(
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
pMask[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Labeling(int ch)
{
dim3 block;
dim3 grid;
/* CCL */
//block = dim3(1, 128);
//grid = dim3(1, IMG_HEIGHT / 128);
block = dim3(256,1);
grid = dim3(IMG_WIDTH/256,IMG_HEIGHT);
gCCL0<<<grid, block>>>(
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
block = dim3(24, 24);
grid = dim3(IMG_WIDTH / 16, IMG_HEIGHT / 16);
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Relabeling(int ch)
{
dim3 block;
dim3 grid;
/* Relabel */
block = dim3(160, 1);
grid = dim3(IMG_WIDTH / 160, IMG_HEIGHT / 1);
gScan<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pSums[ch],IMG_PIXELS);
cudaDeviceSynchronize();
grid = dim3(IMG_PIXELS / K_IMG_MAX);
gSum<<<grid, K_IMG_MAX>>>(pSums[ch],IMG_PIXELS, pBlockSums[ch], 5*K_IMG_MAX);
cudaDeviceSynchronize();
grid = dim3(1);
gSumBlocks<<<grid, K_IMG_MAX>>>(pBlockSums[ch], 5*K_IMG_MAX, (struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
grid = dim3(IMG_PIXELS / K_IMG_MAX);
gFixSums<<<grid, K_IMG_MAX>>>(pSums[ch],IMG_PIXELS, pBlockSums[ch], 5*K_IMG_MAX);
cudaDeviceSynchronize();
block = dim3(160, 1);
grid = dim3(IMG_WIDTH / 160, IMG_HEIGHT / 1);
gRelabeler<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pSums[ch],IMG_PIXELS,
pLabels[ch], IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Centroiding(int ch)
{
dim3 block;
dim3 grid;
int starcount = IMG_STARS_MAX;
if (pInfo[ch]->starCount > 0 && pInfo[ch]->starCount < IMG_STARS_MAX)
{
starcount = pInfo[ch]->starCount;
/* Centroid */
block = dim3(32, 32);
grid = dim3(IMG_WIDTH / 32, IMG_HEIGHT / 32);
gCentroid<<<grid, block>>>(
pLabels[ch], IMG_HEIGHT,IMG_WIDTH,
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
(PipeCentroid *)pCenters[ch],starcount);
cudaDeviceSynchronize();
block = dim3(starcount);
gCentroidFinal<<<1, block>>>((PipeCentroid *)pCenters[ch],starcount);
cudaDeviceSynchronize();
}
else
{
pInfo[ch]->starCount = 0;
}
}
extern "C" __global__ void gPipeline(int gpuId)
{ int ch;
for(ch=0; ch < GPU_CHX; ch++)
{
Background(ch);
Convolution(ch);
Threshold(ch);
Gluing(ch);
Labeling(ch);
Relabeling(ch);
Centroiding(ch);
}
}
extern "C" {
static void ImgKernel_ClearBuffers(int32_t gpu, int32_t ch)
{
/* Clear Work Buffers */
cudaMemset(gInfo[gpu][ch],0,(int)sizeof(ImgInfoBlock_t));
cudaMemset(gCenters[gpu][ch],0,(int)sizeof(ImgCentroid_t)*IMG_STARS_MAX);
cudaMemset(gPercentile[gpu][ch],0,(int)sizeof(int32_t)*K_IMG_MAX);
cudaMemset(gLabels1[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gLabels2[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gRawLabels[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gSums[gpu][ch],0,(int)IMG_BYTES);
cudaMemset(gBlockSums[gpu][ch],0,(int)sizeof(uint16_t)*5*K_IMG_MAX);
}
void ImgKernel_Pipeline(int gpu)
{
    cudaSetDevice(gpu);
    cudaDeviceSynchronize();
    /* Start a new run by clearing the buffers */
    ImgKernel_ClearBuffers(gpu,GPU_CH0);
    ImgKernel_ClearBuffers(gpu,GPU_CH1);
    /* Update Constants */
    cudaMemcpyToSymbol(gSlot,(void*)&slot,sizeof(slot));
    cudaMemcpyToSymbol(threshold_sigma,(void*)&sigmaThreshold,sizeof(sigmaThreshold));
    /* Start the next pipeline kernel */
    gPipeline<<<1,1>>>(gpu);
}
#define LFILTER_LEN 15
static float lFilter[LFILTER_LEN] = {
    .0009f, .01f, .02f, .05f, .08f, .10f, .1325f, .1411f,
    .1325f, .10f, .08f, .05f, .02f, .01f, .0009f };

#define HFILTER_LEN 31
static float hFilter[HFILTER_LEN] = {
    0.0002f, 0.0006f, 0.0025f, 0.0037f, 0.0053f, 0.0074f, 0.0099f, 0.0130f,
    0.0164f, 0.0201f, 0.0239f, 0.0275f, 0.0306f, 0.0331f, 0.0347f, 0.0353f,
    0.0347f, 0.0331f, 0.0306f, 0.0275f, 0.0239f, 0.0201f, 0.0164f, 0.0130f,
    0.0099f, 0.0074f, 0.0053f, 0.0037f, 0.0025f, 0.0006f, 0.0002f };

static float32_t kernel[LFILTER_LEN];
static float32_t kernelBig[HFILTER_LEN];
static inline float32_t ImgKernel_FilterSum(float* arr, int32_t len)
{
    int32_t i;
    float32_t sum = 0.0f;
    for (i=0;i<len;i++) sum += arr[i];
    return sum;
}
void ImgKernel_Setup(int gpu)
{
    int32_t i,ch;
    float32_t sum = 0;
    sum = ImgKernel_FilterSum(lFilter,LFILTER_LEN);
    for (i = 0; i < LFILTER_LEN; i++) kernel[i] = lFilter[i] / sum;
    sum = ImgKernel_FilterSum(hFilter,HFILTER_LEN);
    for (i = 0; i < HFILTER_LEN; i++) kernelBig[i] = hFilter[i] / sum;
    /* One time copy of locations into GPU constant memory */
    cudaMemcpyToSymbol(gkernel, (void*)&kernel, sizeof(float32_t)*LFILTER_LEN);
    cudaMemcpyToSymbol(gkernelBig, (void*)&kernelBig, sizeof(float32_t)*HFILTER_LEN);
    cudaMemcpyToSymbol(pFrameStack, (void*)&gFrameStack[gpu][0], sizeof(uint16_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pBackground, (void*)&gBackground[gpu][0], sizeof(uint16_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pCleanground, (void*)&gCleanground[gpu][0], sizeof(int16_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pLowground, (void*)&gLowground[gpu][0], sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pLowgroundRow, (void*)&gLowgroundRow[gpu][0], sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pHighground, (void*)&gHighground[gpu][0], sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pHighgroundRow, (void*)&gHighgroundRow[gpu][0], sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pForeground, (void*)&gForeground[gpu][0], sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pMask, (void*)&gMask[gpu][0], sizeof(uint8_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pPercentile, (void*)&gPercentile[gpu][0], sizeof(int32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pMins, (void*)&gMins[gpu][0], sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pMaxs, (void*)&gMaxs[gpu][0], sizeof(float32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pLabels1, (void*)&gLabels1[gpu][0], sizeof(int32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pLabels2, (void*)&gLabels2[gpu][0], sizeof(int32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pRawLabels, (void*)&gRawLabels[gpu][0], sizeof(int32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pLabels, (void*)&gLabels[gpu][0], sizeof(int32_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pInfo, (void*)&gInfo[gpu][0], sizeof(ImgInfoBlock_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pSums, (void*)&gSums[gpu][0], sizeof(uint16_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pBlockSums, (void*)&gBlockSums[gpu][0], sizeof(uint16_t*)*GPU_CHX);
    cudaMemcpyToSymbol(pCenters, (void*)&gCenters[gpu][0], sizeof(ImgCentroid_t*)*GPU_CHX);
    for (ch = 0; ch < GPU_CHX; ch++)
    {
        /* Clear the working buffers */
        ImgKernel_ClearBuffers(gpu,ch);
    }
}
}
For two dynamic parallelism kernels launched in separate host streams, the parent and child kernels should all be able to coexist, i.e., execute concurrently.

How to get things to run concurrently is a common question. Once all the requirements are met, whether you actually witness concurrent kernel execution will depend on the resources each kernel consumes: threads per block, total number of thread blocks, registers, and shared memory are a few examples of resource types which, if fully consumed by one kernel, can prevent another kernel from executing concurrently even though all the requirements have been met.
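As a quick first check (not part of the original answer; these are all standard cudaDeviceProp fields in the runtime API), the device must at least report support for concurrent kernels, and it helps to know how much capacity there is to fill. A minimal query sketch:

#include <cstdio>
#include <cuda_runtime.h>

int main(){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    /* concurrentKernels == 1 is a prerequisite for overlap, not a guarantee;
       resource consumption still decides what actually runs together */
    printf("%s: concurrentKernels=%d SMs=%d sharedMemPerBlock=%zu regsPerBlock=%d\n",
           prop.name, prop.concurrentKernels, prop.multiProcessorCount,
           prop.sharedMemPerBlock, prop.regsPerBlock);
    return 0;
}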
The machine does not have infinite capacity. Once the machine's capacity is consumed, exposing additional parallelism (e.g., by trying to launch independent kernels concurrently) is unlikely to yield any improvement.

Greg pointed out that GPU scheduling behavior can affect this as well. Depending on the specific GPU and CUDA version, and possibly other factors, two kernels that each launch a large number of thread blocks may not execute "concurrently" simply because all of one kernel's thread blocks may get scheduled before any of the other kernel's thread blocks. In my view this behavior is just another manifestation of the resource question. (Note also that the scheduling of the individual kernels' thread blocks can be influenced by stream priorities; a sketch of that follows.)
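For completeness, a minimal sketch of prioritized streams, under the assumption of a cc3.5+ device such as the K80 (the stream-priority runtime API has been available since CUDA 5.5; the spin kernel here is just a stand-in workload, not from the question's code):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void spin(unsigned long long cycles){
    /* busy-wait placeholder workload */
    unsigned long long start = clock64();
    while (clock64() < start + cycles);
}

int main(){
    int loPri, hiPri;
    /* numerically lower values mean higher priority */
    cudaDeviceGetStreamPriorityRange(&loPri, &hiPri);
    cudaStream_t sLo, sHi;
    cudaStreamCreateWithPriority(&sLo, cudaStreamNonBlocking, loPri);
    cudaStreamCreateWithPriority(&sHi, cudaStreamNonBlocking, hiPri);
    spin<<<64,128,0,sLo>>>(1000000ULL);
    spin<<<64,128,0,sHi>>>(1000000ULL);  /* its pending blocks are preferred by the scheduler */
    cudaDeviceSynchronize();
    printf("priority range: low=%d high=%d\n", loPri, hiPri);
    return 0;
}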
However, if we are careful to limit resource usage, the parent and child kernels of two dynamic parallelism kernels can indeed coexist, i.e., execute concurrently. Here is a worked example (CUDA 7, Fedora 20, GeForce GT640 cc3.5 GPU):
$ cat t815.cu
#include <stdio.h>

#define DELAY_VAL 5000000000ULL

__global__ void child(){
    /* spin for DELAY_VAL clock cycles (~5s on this GPU) */
    unsigned long long start = clock64();
    while (clock64() < start+DELAY_VAL);
}

__global__ void parent(){
    child<<<1,1>>>();
}

int main(int argc, char* argv[]){
    cudaStream_t st1, st2;
    cudaStreamCreate(&st1);
    cudaStreamCreate(&st2);
    parent<<<1,1,0,st1>>>();
    if (argc > 1){
        printf("running double kernel\n");
        parent<<<1,1,0,st2>>>();
    }
    cudaDeviceSynchronize();
}
$ nvcc -arch=sm_35 -rdc=true -lcudadevrt t815.cu -o t815
$ time ./t815
3.65user 1.88system 0:05.65elapsed 97%CPU (0avgtext+0avgdata 82192maxresident)k
0inputs+0outputs (0major+2812minor)pagefaults 0swaps
$ time ./t815 double
running double kernel
3.68user 1.83system 0:05.64elapsed 97%CPU (0avgtext+0avgdata 82200maxresident)k
0inputs+0outputs (0major+2814minor)pagefaults 0swaps
$ time cuda-memcheck ./t815
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
3.16user 2.25system 0:05.68elapsed 95%CPU (0avgtext+0avgdata 87040maxresident)k
0inputs+0outputs (0major+4573minor)pagefaults 0swaps
$ time cuda-memcheck ./t815 double
========= CUDA-MEMCHECK
running double kernel
========= ERROR SUMMARY: 0 errors
7.27user 3.04system 0:10.46elapsed 98%CPU (0avgtext+0avgdata 87116maxresident)k
0inputs+0outputs (0major+4594minor)pagefaults 0swaps
$
In this case, we see that if I do not use cuda-memcheck, the execution time is approximately the same (~5.6s) whether I run one or two copies of the (parent) kernel in separate host streams. Since the execution times are equal, the inescapable conclusion is that these kernels are executing concurrently (the parent kernels plus the two child kernels). This is not surprising, because these kernels consume very few resources (one thread block each, one thread each, very low register usage, and no shared memory usage).

If, on the other hand, I run the same test under cuda-memcheck, the obvious serialization that cuda-memcheck imposes becomes visible: although the duration of the single kernel launch is relatively unaffected, the two "concurrent" kernel launches take approximately twice as long.
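Applied to the code in the question, the host-based alternative could look something like the sketch below. This is a hypothetical refactor, not code from the question: it assumes gPipeline is split into a per-channel variant (gPipelineCh does not exist in the posted code) so that each channel's 1,1 parent kernel can be launched into its own non-blocking stream. Whether the two pipelines then truly overlap is still governed by the resource considerations above, since many of the child kernels launch enough blocks to fill a K80 by themselves.

/* Hypothetical sketch -- gPipelineCh is NOT in the posted code */
extern "C" __global__ void gPipelineCh(int gpuId, int ch)
{
    Background(ch);
    Convolution(ch);
    Threshold(ch);
    Gluing(ch);
    Labeling(ch);
    Relabeling(ch);
    Centroiding(ch);
}

extern "C" void ImgKernel_PipelineOverlapped(int gpu)
{
    int ch;
    cudaStream_t st[GPU_CHX];
    cudaSetDevice(gpu);
    for (ch = 0; ch < GPU_CHX; ch++)
        cudaStreamCreateWithFlags(&st[ch], cudaStreamNonBlocking);
    /* One 1,1 parent per channel; each is a separate scheduling item */
    for (ch = 0; ch < GPU_CHX; ch++)
        gPipelineCh<<<1,1,0,st[ch]>>>(gpu, ch);
    cudaDeviceSynchronize();
    for (ch = 0; ch < GPU_CHX; ch++)
        cudaStreamDestroy(st[ch]);
}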