使用 OpenMP 并行执行比 C 中的串行执行花费更长的时间？

Question

The serial version takes less time than the parallel one.串行版本比并行版本花费的时间更少。

/*Serial Version*/
/*
 * Reference single-threaded implementation, timed with omp_get_wtime().
 * For every index i in [0, 1100) it:
 *   1. writes the 4-sample window S[i..i+3] as one line of new_file,
 *   2. passes that window to compute_m() and find_min_max(),
 *   3. derives a lower bound S_i and an upper bound S_s via inf()/sup(),
 *   4. stores a value for sample i+2 in Res[i] and writes it to output_f.
 * Note that the timed region includes all of the fprintf() file output.
 * i, j, m, min, max, S_i, S_s, b, S, ECG, Res, new_file and output_f are
 * declared outside this fragment.
 */
double start = omp_get_wtime();

for (i = 0; i < 1100; i++) {
    /* Dump the current window S[i..i+3] on a single line. */
    for (j = i; j < i + 4; j++) {
        fprintf(new_file, "%f  ", S[j]);
    }
    fprintf(new_file, "\n");
    /* Helpers are opaque here; presumably m is a statistic of the window
       and min/max are its extrema -- TODO confirm against their definitions. */
    m = compute_m(S + i, 4);
    find_min_max(S + i, 4, &min, &max);

    /* Lower and upper bounds for this window. */
    S_i = inf(m, min, b); 
    S_s = sup(m, max, b); 

    /* Replace an out-of-range sample S[i + 2] by the bound it violates. */
    if (S[i + 2] < S_i)
        Res[i] = S_i;
    else if (S[i + 2] > S_s)
        Res[i] = S_s;
    else
        /* NOTE(review): the in-range value is taken from ECG, not S --
           confirm that ECG and S are meant to differ here. */
        Res[i] = ECG[i + 2];
    fprintf(output_f, "%f\n", Res[i]);
}

    

    /* Stop the clock and report wall-clock start, end and elapsed seconds. */
    double end = omp_get_wtime();
    printf("\n ------------- TIMING :: Serial Version -------------- ");
    printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);

/*Parallel Version*/
    /* Asker's parallel attempt (sketch: the loop bodies are elided).
       Wall-clock timing brackets the whole parallel loop nest. */
    double start = omp_get_wtime();
/* Outer loop: its 1100 iterations are divided among the threads of the team. */
#pragma omp parallel for
    for (i = 0; i < 1100; i++) {
/* NOTE(review): this requests a second, nested parallel region on every
   outer iteration for a loop of only 4 iterations; the region set-up cost
   is paid 1100 times and is a likely reason for the slowdown. */
#pragma omp parallel for
        for (j = i; j < i + 4; j++) {
            serial code ...
        }
        /* NOTE(review): the elided body presumably matches the serial
           version; if so, its scratch variables need private clauses and
           its file output is not ordered across threads -- TODO confirm. */
        serial code ...
    }
double end = omp_get_wtime();
    /* NOTE(review): the banner below still reads "Serial Version" although
       it reports the parallel timing (copy-paste from the serial code). */
    printf("\n ------------- TIMING :: Serial Version -------------- ");
    printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);

I have tried multiple times, the serial execution is always faster why?我试过多次，为什么串行执行总是更快？

Why is serial execution faster here?为什么这里的串行执行速度更快？ Am I calculating the execution time in the right way?我是否以正确的方式计算执行时间？

Answer 1

Assuming that compute_m does not write to S and that find_min_max does not write to S_i or read from min and max, this should work.假设 compute_m 不写入 S 并且 find_min_max 不写入 S_i 或从 min 和 max 读取，这应该可以工作。

/*Parallel Version A*/
double start = omp_get_wtime();

const int nThreads = omp_get_max_threads();

#pragma omp parallel sections num_threads(2) default(none) shared(S, Res, ECG, b, min, max, m, S_i, S_s, nThreads)
{
#pragma omp section
    for (i = 0; i < 1100; i++) {
        for (j = i; j < i + 4; j++) {
            fprintf(new_file, "%f  ", S[j]);
        }
        fprintf(new_file, "\n");
    }
#pragma omp section
    {
#pragma omp parallel for num_threads(nThreads - 1) default(none) shared(S, Res, ECG, b) private(min, max, m, S_i, S_s)
        for (i = 0; i < 1100; i++) {
            m = compute_m(S + i, 4);
            find_min_max(S + i, 4, &min, &max);

            S_i = inf(m, min, b); 
            S_s = sup(m, max, b); 

            if (S[i + 2] < S_i)
                Res[i] = S_i;
            else if (S[i + 2] > S_s)
                Res[i] = S_s;
            else
                Res[i] = ECG[i + 2];
        }
        for (i = 0; i < 1100; i++) {
            fprintf(output_f, "%f\n", Res[i]);
        }
    }
}

double end = omp_get_wtime();
printf("\n ------------- TIMING :: Parallel Version A -------------- ");
printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);

Another, slightly less complicated solution would be this one:另一个不太复杂的解决方案是这个

/*Parallel Version B*/
/*
 * Two phases inside one parallel region:
 *   1. a work-shared loop computes Res[0..1099];
 *   2. two sections write S (to new_file) and Res (to output_f) concurrently.
 * The implicit barrier at the end of the `omp for` guarantees that Res is
 * complete before the second section starts reading it.
 * i, j, m, min, max, S_i, S_s, b, S, ECG, Res, new_file and output_f are
 * declared outside this fragment.
 */
double start = omp_get_wtime();

/* With default(none) every variable used inside must be listed: the file
   handles are shared, and the loop counters i and j must be private because
   the two sections run their own loops at the same time. */
#pragma omp parallel default(none) shared(S, Res, ECG, b, new_file, output_f) private(i, j, min, max, m, S_i, S_s)
{
#pragma omp for
    for (i = 0; i < 1100; i++) {
        m = compute_m(S + i, 4);
        find_min_max(S + i, 4, &min, &max);

        S_i = inf(m, min, b);
        S_s = sup(m, max, b);

        /* Replace an out-of-range sample by the bound it violates. */
        if (S[i + 2] < S_i)
            Res[i] = S_i;
        else if (S[i + 2] > S_s)
            Res[i] = S_s;
        else
            Res[i] = ECG[i + 2];
    }

#pragma omp sections
    {
#pragma omp section
        for (i = 0; i < 1100; i++) {
            for (j = i; j < i + 4; j++) {
                fprintf(new_file, "%f  ", S[j]);
            }
            fprintf(new_file, "\n");
        }
#pragma omp section
        for (i = 0; i < 1100; i++) {
            fprintf(output_f, "%f\n", Res[i]);
        }
    }
}

double end = omp_get_wtime();
printf("\n ------------- TIMING :: Parallel Version B -------------- ");
printf("\nStart = %.16g\nend = %.16g\nDiff_time = %.16g\n", start, end, end - start);

In the first version the calculation happens in parallel with writing out S, in the second version the calculations happen first, before S and Res are written to file in parallel.在第一个版本中，计算与写出 S 并行发生，在第二个版本中，计算首先发生，然后 S 和 Res 并行写入文件。 I wouldn't bet on which one is faster, so just try it out on your hardware.我不会打赌哪个更快，所以只需在您的硬件上尝试一下。

These can still be slower than the serial version, because spawning threads always has some overhead.这些仍然可能比串行版本慢，因为产生线程总是有一些开销。

使用 OpenMP 并行执行比 C 中的串行执行花费更长的时间？

问题描述

1 个解决方案

解决方案1
0 已采纳 2020-08-13 14:42:37

使用 OpenMP 并行执行比 C 中的串行执行花费更长的时间？

问题描述

1 个解决方案

解决方案1 0 已采纳 2020-08-13 14:42:37

解决方案1
0 已采纳 2020-08-13 14:42:37