
Concurrent Kernel Launch Example - CUDA

I'm trying to get concurrent kernel launches working for a very complex CUDA kernel, so I thought I'd start out with a simple example. It just launches one kernel that does a sum reduction. Simple. Here it is:

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <cuda.h>

extern __shared__ char dsmem[];     // dynamically sized shared memory (size given at launch)
__device__ double *scratch_space;   // pointer into dsmem, set at the top of the kernel

// sum reduction that first copies the input into shared-memory scratch space
__device__ double NDreduceSum(double *a, unsigned short length)
{
    const int tid = threadIdx.x;
    unsigned short k = length;
    double *b;

    b = scratch_space;

    for (int i = tid; i < length; i+= blockDim.x)
        b[i] = a[i];

    __syncthreads();

    do {
        k = (k + 1) / 2;

        if (tid < k && tid + k < length)
            b[tid] += b[tid + k];

        length = k;
        __syncthreads();
    } while (k > 1);

    return b[0];
}

// sum reduction performed in place, overwriting the input array
__device__ double reduceSum(double *a, unsigned short length)
{
    const int tid = threadIdx.x;
    unsigned short k = length;

    do
    {
        k = (k + 1) / 2;

        if (tid < k && tid + k < length)
            a[tid] += a[tid + k];

        length = k;
        __syncthreads();
    }
    while (k > 1);

    return a[0];
}

// times three ways of summing ad[0..size): shared-memory reduction,
// single-thread serial sum, and in-place reduction
__global__ void kernel_thing(double *ad, int size)
{
    double sum_1, sum_2, sum_3;
    time_t begin, end, t1, t2, t3;

    scratch_space = (double *) &dsmem[0];

    // run the shared-memory reduction many times to make the kernel long-running;
    // begin/end hold the timing of the last iteration
    for (int j = 0; j < 1000000; j++) {
        begin = clock();
        sum_1 = NDreduceSum(ad, size);
        end = clock();
    }

    __syncthreads();

    t1 = end - begin;

    begin = clock();

    sum_2 = 0;
    if (threadIdx.x == 0) {     // serial sum on a single thread, for comparison
        for (int i = 0; i < size; i++) {
            sum_2 += ad[i];
        }
    }

    __syncthreads();

    end = clock();

    t2 = end - begin;

    __syncthreads();
    begin = clock();
    sum_3 = reduceSum(ad, size);
    end = clock();

    __syncthreads();

    t3 = end - begin;

    if (threadIdx.x == 0) {
        printf("Sum found: %lf and %lf and %lf. In %ld and %ld and %ld ticks.\n", sum_1, sum_2, sum_3, t1, t2, t3);
    }
}

int main(int argc, char **argv)
{
    int i;
    const int size = 512;
    double *a, *ad, *b, *bd;
    double sum_a, sum_b;
    cudaStream_t stream_a, stream_b;
    cudaError_t result;
    cudaEvent_t a_start, a_stop, b_start, b_stop;

    a = (double *) malloc(sizeof(double) * size);
    b = (double *) malloc(sizeof(double) * size);

    srand48(time(0));

    for (i = 0; i < size; i++) {
        a[i] = drand48();
    }

    for (i = 0; i < size; i++) {
        b[i] = drand48();
    }

    sum_a = 0;
    for (i = 0; i < size; i++) {
        sum_a += a[i];
    }

    sum_b = 0;
    for (i = 0; i < size; i++) {
        sum_b += b[i];
    }

    printf("Looking for sum_a %lf\n", sum_a);
    printf("Looking for sum_b %lf\n", sum_b);

    cudaEventCreate(&a_start);
    cudaEventCreate(&b_start);
    cudaEventCreate(&a_stop);
    cudaEventCreate(&b_stop);

    cudaMalloc((void **) &ad, sizeof(double) * size);
    cudaMalloc((void **) &bd, sizeof(double) * size);

    result = cudaStreamCreate(&stream_a);
    result = cudaStreamCreate(&stream_b);

    result = cudaMemcpyAsync(ad, a, sizeof(double) * size, cudaMemcpyHostToDevice, stream_a);
    result = cudaMemcpyAsync(bd, b, sizeof(double) * size, cudaMemcpyHostToDevice, stream_b);

    cudaEventRecord(a_start);
    kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);    // 1 block, 512 threads, 48 KB dynamic shared memory
    cudaEventRecord(a_stop);
    cudaEventRecord(b_start);
    kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
    cudaEventRecord(b_stop);

    result = cudaMemcpyAsync(a, ad, sizeof(double) * size, cudaMemcpyDeviceToHost, stream_a);
    result = cudaMemcpyAsync(b, bd, sizeof(double) * size, cudaMemcpyDeviceToHost, stream_b);

    cudaEventSynchronize(a_stop);
    cudaEventSynchronize(b_stop);

    float a_ms = 0;
    float b_ms = 0;
    cudaEventElapsedTime(&a_ms, a_start, a_stop);
    cudaEventElapsedTime(&b_ms, b_start, b_stop);

    printf("%lf ms for A.\n", a_ms);
    printf("%lf ms for B.\n", b_ms);

    result = cudaStreamDestroy(stream_a);
    result = cudaStreamDestroy(stream_b);

    if (result != cudaSuccess) {
        printf("I should probably do this after each important operation.\n");
    }

    /*
    printf("Matrix after:\n");
    for (i = 0; i < size; i++) {
        printf("%lf ", a[i]);
    }
    printf("\n");
    */

    free(a);
    free(b);
    cudaFree(ad);
    cudaFree(bd);

    return 0;
}

It's compiled like this:

CFLAGS = -arch sm_35

CC = nvcc

all: parallel

parallel: parallel.cu
    $(LINK.c) $^ -o $@

clean:
    rm -f *.o core parallel
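
For reference, since this Makefile leans on make's built-in rules: with CC = nvcc and CFLAGS = -arch sm_35, the implicit $(LINK.c) rule expands to roughly the following command (my expansion, not part of the original post):

nvcc -arch sm_35 parallel.cu -o parallel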

I'm using a Tesla K20X.

When I run this simple example, I get the following output:

Looking for sum_a 247.983945
Looking for sum_b 248.033749
Sum found: 247.983945 and 247.983945 and 247.983945. In 3242 and 51600 and 4792 ticks.
Sum found: 248.033749 and 248.033749 and 248.033749. In 3314 and 52000 and 4497 ticks.
4645.079102 ms for A.
4630.725098 ms for B.
Application 577759 resources: utime ~8s, stime ~2s, Rss ~82764, inblocks ~406, outblocks ~967

So, as you can see, each kernel gets the correct result and takes around 4.5 s, which is what I got in an earlier single-kernel version. Great! However, as you can see from the aprun output, the wall time is actually around 10 s, which is much more than the single-kernel version. So it looks like either the kernels are not launching in parallel, or I'm getting hardly any of the speed-up (2x) I was expecting from concurrent kernel launches.

To tl;dr this question:

  1. Is there anything missing from my code example? Are the kernels actually launching in parallel?
  2. What kind of speed-up should I expect with a Tesla K20X? Shouldn't the kernels run exactly in parallel, completing twice the work in the same time? How many kernels can I expect to run efficiently in parallel?

Thanks for any help.

The cudaEventRecord operations in between your kernels are causing serialization: an event recorded with no stream argument goes into the legacy default stream, which synchronizes with every other stream.

As it stands now, the results you are getting:

4645.079102 ms for A.
4630.725098 ms for B.

are running back-to-back as a result of this serialization.

Instead, just time the whole kernel launch sequence:

cudaEventRecord(a_start);
kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
cudaEventRecord(a_stop);

And I think you will see that the time reported between (a_start, a_stop) is roughly the same as one of your kernels previously (~4600 ms), indicating more or less full concurrency. I used CUDA 6 RC, copied data back to the host instead of using printf from the kernel, and eliminated the cudaEventRecord operations between the kernel calls, and my overall execution time was ~4.8 s. If I didn't modify the cudaEventRecord arrangement, my execution time was ~8.3 s instead.
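
If you still want per-kernel timings without reintroducing the serialization, one option is to record each event into its own stream, so that nothing goes through the legacy default stream. A minimal sketch (my suggestion, not part of the original answer), reusing the events and streams from the question:

cudaEventRecord(a_start, stream_a);   // recorded into stream_a, not the default stream
kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
cudaEventRecord(a_stop, stream_a);

cudaEventRecord(b_start, stream_b);   // likewise for stream_b
kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
cudaEventRecord(b_stop, stream_b);

cudaEventSynchronize(a_stop);         // block the host until each stream finishes
cudaEventSynchronize(b_stop);

With no event in the default stream, the two launch sequences stay independent, and cudaEventElapsedTime should then report each kernel's own duration.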

A few other notes:

  • I wouldn't use printf from the kernel when running tests like these.
  • You won't get overlap of compute and cudaMemcpyAsync when the host buffers are allocated with malloc. You need to use cudaHostAlloc (see the sketch after this list).
  • I would start by running and understanding the concurrent kernels CUDA sample.
  • You may want to review the appropriate section of the programming guide.
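
On the cudaHostAlloc point: a minimal sketch of what that change looks like in the posted code, assuming everything else stays the same. Only the host-buffer allocation and deallocation change:

double *a, *b;

// page-locked (pinned) host allocations; cudaMemcpyAsync can only
// overlap with kernel execution when the host memory is pinned
cudaHostAlloc((void **) &a, sizeof(double) * size, cudaHostAllocDefault);
cudaHostAlloc((void **) &b, sizeof(double) * size, cudaHostAllocDefault);

/* ... rest of the program unchanged ... */

// pinned buffers are released with cudaFreeHost, not free()
cudaFreeHost(a);
cudaFreeHost(b);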

