OpenMP serial faster than parallel
The code is parallel, but I don't know why it is slower than my serial version, and when I increase the thread count to 7 to 10 the program also gets slower. I have been trying to figure out what the problem is, but it has been difficult for me. I made the for loop parallel, but it does not seem to be working. I do not receive any error when I run my code.
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    int m;
    int n;
    double tol; // = 0.0001;
    double tstart, tstop;
    int i, j, iter, nthreads;

    m = atoi(argv[1]);
    n = atoi(argv[2]);
    tol = atof(argv[3]);
    double t[m+2][n+2], tnew[m+1][n+1], diff, difmax, priv_difmax;

    printf("%d %d %lf\n", m, n, tol);
    printf("Enter the number of threads (max 10) ");
    scanf("%d", &nthreads);

    omp_set_num_threads(nthreads);
    tstart = omp_get_wtime();

    // initialise temperature array
    #pragma omp parallel for schedule(static) \
        default(shared) private(i,j)
    for (i=0; i <= m+1; i++) {
        for (j=0; j <= n+1; j++) {
            t[i][j] = 30.0;
        }
    }

    // fix boundary conditions
    for (i=1; i <= m; i++) {
        t[i][0] = 20.0;
        t[i][n+1] = 100.0;
    }
    for (j=1; j <= n; j++) {
        t[0][j] = 10.0;
        t[m+1][j] = 140.0;
    }

    // main loop
    iter = 0;
    difmax = 1000000.0;
    while (difmax > tol) {
        iter++;

        // update temperature for next iteration
        #pragma omp parallel for schedule(static) \
            default(shared) private(i,j)
        for (i=1; i <= m; i++) {
            for (j=1; j <= n; j++) {
                tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
            }
        }

        // work out maximum difference between old and new temperatures
        difmax = 0.0;
        #pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
        {
            priv_difmax = 0.0;
            #pragma omp for schedule(static)
            for (i=1; i <= m; i++) {
                for (j=1; j <= n; j++) {
                    diff = fabs(tnew[i][j]-t[i][j]);
                    if (diff > priv_difmax) {
                        priv_difmax = diff;
                    }
                    // copy new to old temperatures
                    t[i][j] = tnew[i][j];
                }
                #pragma omp critical
                if (priv_difmax > difmax) {
                    difmax = priv_difmax;
                }
            }
        }
    }
    tstop = omp_get_wtime();

    // print results
    printf("iter = %d difmax = %9.11lf", iter, difmax);
    for (i=0; i <= m+1; i++) {
        printf("\n");
        for (j=0; j <= n+1; j++) {
            printf("%3.5lf ", t[i][j]);
        }
    }
    printf("\n");
    tstop = omp_get_wtime();
    printf("time taken is %4.3lf\n", (tstop-tstart));
    printf("\n");
}
I do not see obvious issues except perhaps in the following code:
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;
    #pragma omp for schedule(static)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
            if (diff > priv_difmax) {
                priv_difmax = diff;
            }
            // copy new to old temperatures
            t[i][j] = tnew[i][j];
        }
        #pragma omp critical
        if (priv_difmax > difmax) {
            difmax = priv_difmax;
        }
    }
}
The reduction part, copying priv_difmax to difmax, should be moved out of the loop so that threads pass through the critical section only once, and not at each iteration of the outer loop.
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;
    #pragma omp for schedule(static) nowait // no need to wait after the loop
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
            if (diff > priv_difmax) {
                priv_difmax = diff;
            }
            // copy new to old temperatures
            t[i][j] = tnew[i][j];
        }
    }
    // Finish the loop first, then update difmax
    #pragma omp critical
    if (priv_difmax > difmax) {
        difmax = priv_difmax;
    }
} // Implicit barrier
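As a side note (not part of the original answer): on compilers supporting OpenMP 3.1 or later, this hand-written pattern of a private maximum plus a critical section can be expressed with a max reduction, where the runtime combines the per-thread maxima for you. A minimal sketch on flat arrays, with max_abs_diff being a hypothetical helper name, not something from the question's code:

```c
#include <math.h>

/* Hypothetical helper: returns the largest absolute difference between
 * two length-n arrays, using an OpenMP 3.1 "max" reduction instead of a
 * per-thread maximum combined through a critical section. */
double max_abs_diff(const double *t, const double *tnew, int n)
{
    double difmax = 0.0;
    /* each thread accumulates into its own copy of difmax; the
     * reduction clause merges them with max at the end of the loop */
    #pragma omp parallel for schedule(static) reduction(max : difmax)
    for (int i = 0; i < n; i++) {
        double diff = fabs(tnew[i] - t[i]);
        if (diff > difmax)
            difmax = diff;
    }
    return difmax;
}
```

Applied to the answer's code, priv_difmax and the critical section would collapse into a single #pragma omp for reduction(max:difmax), at the cost of requiring a reasonably recent OpenMP implementation.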
Now, parallelization has an overhead cost, and speedup can be expected only for large values of m and n: the problem you are considering may simply be too small. One way to reduce the overhead is to merge the two parallel constructs so that the pool of threads does not have to be spawned twice. Better still, put the while loop inside the parallel construct, so that existing threads only have to be synchronized at each iteration, rather than created and destroyed:
difmax = 1000000.0;
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
while (difmax > tol) {

    // have one thread reset difmax and increment iter
    #pragma omp single nowait
    iter++, difmax = 0.0;

    // loop to update tnew - distributed among threads
    // (a plain "omp for" here: we are already inside the parallel region)
    #pragma omp for schedule(static)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
        }
    } // implicit barrier here

    // each thread resets its private difmax
    priv_difmax = 0.0;

    // loop to compute difmax - distributed among threads
    #pragma omp for schedule(static) nowait
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
            if (diff > priv_difmax) {
                priv_difmax = diff;
            }
            // copy new to old temperatures
            t[i][j] = tnew[i][j];
        }
    }

    // each thread now updates difmax if needed, one at a time
    #pragma omp critical
    if (priv_difmax > difmax) {
        difmax = priv_difmax;
    }

    // put a barrier here to make sure that difmax has been updated
    // before any thread tests the condition for the next iteration
    // of the while loop
    #pragma omp barrier
}
The best way to compare how a code runs in serial and in parallel is to compile it with and without OpenMP support (e.g. with gcc, compile with and without the -fopenmp compiler and linker flag). This helps point out whether the issue actually lies with parallelization, or with other modifications between the original serial code and the "parallel-ready" version.
The idea is to know where time is lost when going from the original serial code, to the parallel code compiled without OpenMP support, to the parallel code compiled with OpenMP.
Some preprocessor definitions are needed, because without OpenMP support the compiler will not recognize functions like omp_get_thread_num(). omp_get_wtime() should not be used either: since all your time measurements are done outside of parallel regions, there is no need for that specific function, and a call to time() will be accurate enough (this requires #include <time.h>).
// This part is necessary for the code to compile whether or not OpenMP is enabled
#ifdef _OPENMP
#include <omp.h>
#else
#ifndef _ESCAPE_OMPENMP
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
#define omp_get_max_threads() 0
#define _ESCAPE_OMPENMP
#endif
#endif
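Following that suggestion, the omp_get_wtime() calls can be replaced by plain time() calls, which compile with or without -fopenmp. A sketch, where timing_start and timing_stop are hypothetical helper names rather than anything from the question's code:

```c
#include <time.h>

/* Portable wall-clock timing: time() works with or without -fopenmp,
 * unlike omp_get_wtime(). Note that time() only has one-second
 * resolution, so it is meaningful only for runs lasting several
 * seconds. */
static time_t solver_start;

/* record the starting wall-clock time */
void timing_start(void)
{
    solver_start = time(NULL);
}

/* return the elapsed wall-clock time in seconds */
double timing_stop(void)
{
    /* difftime avoids assuming time_t is an integer type */
    return difftime(time(NULL), solver_start);
}
```

In main(), one would call timing_start() before the while loop and print timing_stop() after it, in place of the tstart/tstop pair.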