
GPU code running slower than CPU version

I am working on an application which divides a string into pieces and assigns each piece to a block. Within each block the text is scanned character by character, and a shared array of ints, D, is updated by different threads in parallel based on the character read. At the end of each iteration the last element of D is checked, and if it satisfies the condition, a global int array, matched, is set to 1 at the position corresponding to the text. This code was executed on an NVIDIA GeForce Fermi 550 and runs even slower than the CPU version. I have just included the kernel here:

__global__ void match(uint32_t* BB_d,const char* text_d,int n, int m,int k,int J,int lc,int start_addr,int tBlockSize,int overlap ,int* matched){
    __shared__ int D[MAX_THREADS+2];           // shared state array, one word per thread plus boundary words
    __shared__ char Text_S[MAX_PATTERN_SIZE];  // chunk of the text assigned to this block
    __shared__ int DNew[MAX_THREADS+2];        // updated state, copied back into D after each character
    __shared__ int BB_S[4][MAX_THREADS];       // per-symbol bit masks loaded from BB_d
    int w=threadIdx.x+1;

    for(int i=0;i<4;i++)
    {
        BB_S[i][threadIdx.x]= BB_d[i*J+threadIdx.x];
    }

    {
        D[threadIdx.x] = 0;
        {
            D[w] = (1<<(k+1)) -1;

            for(int i = 0; i < lc - 1; i++)
            {
                D[w] = (D[w] << k+2) + (1<<(k+1)) -1;
            }
        }
        D[J+1] = (1<<((k+2)*lc)) - 1;
    }
    int startblock=(blockIdx.x == 0?start_addr:(start_addr+(blockIdx.x * (tBlockSize-overlap))));
    int size= (((startblock + tBlockSize) > n )? ((n- (startblock))):( tBlockSize));

    int copyBlock=(size/J)+ ((size%J)==0?0:1);
    if((threadIdx.x * copyBlock) <= size)
        memcpy(Text_S+(threadIdx.x*copyBlock),text_d+(startblock+threadIdx.x*copyBlock),(((((threadIdx.x*copyBlock))+copyBlock) > size)?(size-(threadIdx.x*copyBlock)):copyBlock));
    memcpy(DNew, D, (J+2)*sizeof(int));
    __syncthreads();
    uint32_t initial = D[1];
    uint32_t x;
    uint32_t mask = 1;
    for(int i = 0; i < lc - 1; i++)mask = (mask<<(k+2)) + 1;
    for(int i = 0; i < size;i++)
    {
        {
            x =  ((D[w] >> (k+2)) | (D[w - 1] << ((k + 2)* (lc - 1))) | (BB_S[(((int)Text_S[i])/2)%4][w-1])) & ((1 << (k + 2)* lc) - 1);
            DNew[w] = ((D[w]<<1) | mask)
                & (((D[w] << k+3) | mask|((D[w +1] >>((k+2)*(lc - 1)))<<1)))
                & (((x + mask) ^ x) >> 1)
                & initial;
        }
        __syncthreads();
        memcpy(D, DNew, (J+2)*sizeof(int));
        if(!(D[J] & 1<<(k + (k + 2)*(lc*J -m + k ))))
        {
            matched[startblock+i] = 1;
            D[J] |= ((1<<(k + 1 + (k + 2)*(lc*J -m + k ))) - 1);
        }
    }
}

I am not very familiar with CUDA so I dont quite understand issues such as shared memory bank conflicts. 我对CUDA不太熟悉所以我不太明白共享内存库冲突等问题。 Could that be the bottleneck here? 这可能是这里的瓶颈吗?

As asked, this is the code where I launch the kernels: 如上所述,这是我启动内核的代码:

#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#define uint32_t unsigned int
#define MAX_THREADS 512
#define MAX_PATTERN_SIZE 1024
#define MAX_BLOCKS 8
#define MAX_STREAMS 16
#define TEXT_MAX_LENGTH 1000000000
void calculateBBArray(uint32_t** BB,const char* pattern_h,int m,int k , int lc , int J){};
void checkCUDAError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
char* getTextString() {
    FILE *input;
    int c;                 // int rather than char so the EOF comparison below is reliable
    char *inputbuffer = (char *)malloc(sizeof(char) * TEXT_MAX_LENGTH);

    int numchars = 0;

    input = fopen("sequence.fasta", "r");
    c = fgetc(input);
    while (c != EOF)
    {
        inputbuffer[numchars] = c;
        numchars++;
        c = fgetc(input);
    }
    fclose(input);
    inputbuffer[numchars] = '\0';
    return inputbuffer;
}

int main(void) {
    const char pattern_h[] = "TACACGAGGAGAGGAGAAGAACAACGCGACAGCAGCAGACTTTTTTTTTTTTACAC";
    char *text_h = getTextString();  // reading text from file, supported upto 200MB currently

    int k = 13;
    int i;
    int count = 0;
    char *pattern_d, *text_d;     // pointers to device memory
    char *text_new_d;
    int *matched_d;
    int *matched_new_d;
    uint32_t *BB_d;
    uint32_t *BB_new_d;
    int *matched_h = (int *)malloc(sizeof(int) * strlen(text_h));
    cudaMalloc((void **)&pattern_d, sizeof(char) * strlen(pattern_h) + 1);
    cudaMalloc((void **)&text_d, sizeof(char) * strlen(text_h) + 1);
    cudaMalloc((void **)&matched_d, sizeof(int) * strlen(text_h));
    cudaMemcpy(pattern_d, pattern_h, sizeof(char) * strlen(pattern_h) + 1, cudaMemcpyHostToDevice);
    cudaMemcpy(text_d, text_h, sizeof(char) * strlen(text_h) + 1, cudaMemcpyHostToDevice);
    cudaMemset(matched_d, 0, sizeof(int) * strlen(text_h));

    int m = strlen(pattern_h);
    int n = strlen(text_h);

    uint32_t *BB_h[4];
    unsigned int maxLc = ((((m - k) * (k + 2)) > (31)) ? (31 / (k + 2)) : (m - k));
    unsigned int lc = 2;   // Determines the number of threads per block
                           // can be varied upto maxLc for tuning performance
    if (lc > maxLc)
    {
        exit(0);
    }
    unsigned int noWordorNfa = ((m - k) / lc) + (((m - k) % lc) == 0 ? 0 : 1);
    cudaMalloc((void **)&BB_d, sizeof(int) * noWordorNfa * 4);
    if (noWordorNfa >= MAX_THREADS)
    {
        printf("Error: max threads\n");
        exit(0);
    }

    calculateBBArray(BB_h, pattern_h, m, k, lc, noWordorNfa);  // not included this function

    for (i = 0; i < 4; i++)
    {
        cudaMemcpy(BB_d + i * noWordorNfa, BB_h[i], sizeof(int) * noWordorNfa, cudaMemcpyHostToDevice);
    }
    int overlap = m;
    int textBlockSize = (((m + k + 1) > n) ? n : (m + k + 1));
    cudaStream_t stream[MAX_STREAMS];
    for (i = 0; i < MAX_STREAMS; i++) {
        cudaStreamCreate(&stream[i]);
    }

    int start_addr = 0, index = 0, maxNoBlocks = 0;
    if (textBlockSize > n)
    {
        maxNoBlocks = 1;
    }
    else
    {
        maxNoBlocks = ((1 + ((n - textBlockSize) / (textBlockSize - overlap)) + (((n - textBlockSize) % (textBlockSize - overlap)) == 0 ? 0 : 1)));
    }
    int kernelBlocks = ((maxNoBlocks > MAX_BLOCKS) ? MAX_BLOCKS : maxNoBlocks);
    int blocksRemaining = maxNoBlocks;
    printf(" maxNoBlocks %d kernel Blocks %d \n", maxNoBlocks, kernelBlocks);
    while (blocksRemaining > 0)
    {
        kernelBlocks = ((blocksRemaining > MAX_BLOCKS) ? MAX_BLOCKS : blocksRemaining);
        printf(" Calling %d Blocks with starting Address %d , textBlockSize %d \n", kernelBlocks, start_addr, textBlockSize);
        match<<<kernelBlocks, noWordorNfa, 0, stream[(index++) % MAX_STREAMS]>>>(BB_d, text_d, n, m, k, noWordorNfa, lc, start_addr, textBlockSize, overlap, matched_d);
        start_addr += kernelBlocks * (textBlockSize - overlap);
        blocksRemaining -= kernelBlocks;
    }
    cudaMemcpy(matched_h, matched_d, sizeof(int) * strlen(text_h), cudaMemcpyDeviceToHost);
    checkCUDAError("Matched Function");
    for (i = 0; i < MAX_STREAMS; i++)
        cudaStreamSynchronize(stream[i]);
    // do stuff with matched
    // ....
    // ....
    free(matched_h); cudaFree(pattern_d); cudaFree(text_d); cudaFree(matched_d);
    return 0;
}

The number of threads launched per block depends upon the length of pattern_h (it could be at most maxLc above). I expect it to be around 30 in this case. Shouldn't that be enough to see a good amount of concurrency? As for blocks, I see no point in launching more than MAX_BLOCKS (=10) at a time since the hardware can schedule only 8 simultaneously.
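
Rather than hard-coding the assumption that the hardware can run 8 blocks at once, the relevant limits can be queried at runtime. A minimal sketch (not part of the original program, and assuming device 0 is the card in use):

#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // query device 0
    printf("SMs: %d, max threads/block: %d, shared mem/block: %zu bytes\n",
           prop.multiProcessorCount, prop.maxThreadsPerBlock, prop.sharedMemPerBlock);
    return 0;
}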

NOTE: I don't have GUI access.

With all the shared memory you are using, you could be running into bank conflicts if consecutive threads are not reading from consecutive addresses in the shared arrays... that could cause serialization of the memory accesses, which in turn will degrade the parallel performance of your algorithm.
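
For illustration only (this is not taken from the kernel above), a toy kernel contrasting a conflict-free shared-memory access pattern with one that serializes, assuming a Fermi-class device with 32 banks of 4-byte words and a launch of 32 threads:

__global__ void bankConflictDemo(int *out) {
    __shared__ int tile[32][32];

    // Fill one row and one column so the reads below are defined.
    tile[0][threadIdx.x] = threadIdx.x;
    tile[threadIdx.x][0] = threadIdx.x;
    __syncthreads();

    // Conflict-free: consecutive threads read consecutive 4-byte words,
    // which land in different banks.
    int a = tile[0][threadIdx.x];

    // 32-way bank conflict: consecutive threads read words 32 apart,
    // so they all hit the same bank and the loads are serialized.
    int b = tile[threadIdx.x][0];

    out[threadIdx.x] = a + b;
}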

I briefly looked at your code, but it looks like you are sending data back and forth to the GPU, creating a bottleneck on the bus? Did you try profiling it?
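
Even without GUI access the kernel can be timed from the command line, or bracketed with CUDA events in the host code. A minimal sketch of event timing (assuming the launches being measured run on the default stream, or that the other streams are synchronized first):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
// ... kernel launch(es) to be measured ...
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed GPU time in milliseconds
printf("kernel time: %.3f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);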

I found that I was copying the whole array DNew to D in each thread, rather than copying only the portion each thread was supposed to update, D[w]. This caused the threads to execute serially, although I don't know if it could be called a shared memory bank conflict. Now it gives an 8-9x speedup for large enough patterns (= more threads). This is much less than what I expected. I will try to increase the number of blocks as suggested. I don't know how to increase the # of threads.
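
For reference, the change described above amounts to replacing the per-thread memcpy of the whole state array with a per-element copy; a rough sketch against the kernel shown in the question (not necessarily the exact final code):

// Before: every thread copied the entire array, which serialized the threads.
// memcpy(D, DNew, (J+2)*sizeof(int));

// After: each thread copies only the word it owns, then the block synchronizes
// before anyone reads a neighbour's entry in the next iteration.
D[w] = DNew[w];
__syncthreads();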
