openmp串行比並行快

Question

代碼是並行的，但是我不知道為什么它比我的串行要慢，當我將線程添加到7到10時，程序也會變慢。

我一直在試圖找出問題所在，但對我來說卻很難

我將for循環設為並行，但似乎無法正常工作。 運行代碼時，我沒有收到任何錯誤。

#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdlib.h>


int main(int argc, char *argv[])
{

    int m; 
    int n;
    double tol;// = 0.0001;
    double tstart, tstop;

    int i, j, iter, nthreads;



    m = atoi(argv[1]);
    n = atoi(argv[2]);
    tol = atof(argv[3]);

    double t[m+2][n+2], tnew[m+1][n+1], diff, difmax,priv_difmax;

    printf("%d %d %lf\n",m,n,tol);
    printf("Enter the number of threads (max 10) ");
    scanf("%d",&nthreads);

    omp_set_num_threads(nthreads);
    tstart = omp_get_wtime ();

    //** initialise temperature array*


    #pragma omp parallel for schedule(static)\
    default(shared) private(i,j)
    for (i=0; i <= m+1; i++) {
        for (j=0; j <= n+1; j++) {
            t[i][j] = 30.0;
        }
    }

    //*** fix boundary conditions***


    for (i=1; i <= m; i++) {
        t[i][0] = 20.0;
        t[i][n+1] = 100.0;
    }
    for (j=1; j <= n; j++) {
        t[0][j] = 10.0;
        t[m+1][j] = 140.0;
    }


    //** main loop**


    iter = 0;
    difmax = 1000000.0;
    while (difmax > tol) {
        iter++;

        // **update temperature for next iteration**


        #pragma omp parallel for schedule(static) \
        default(shared) private(i,j)
        for (i=1; i <= m; i++) {
            for (j=1; j <= n; j++) {
                tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
            }
        }

        // **work out maximum difference between old and new temperatures**

        difmax = 0.0;

        #pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
        {
            priv_difmax = 0.0;
            #pragma omp for schedule(static)
            for (i=1; i <= m; i++) {
                for (j=1; j <= n; j++) {
                    diff = fabs(tnew[i][j]-t[i][j]);

                    if (diff > priv_difmax) {
                        priv_difmax = diff;
                    }

                    //** copy new to old temperatures**
                    t[i][j] = tnew[i][j];
                }
                #pragma omp critical 
                if (priv_difmax > difmax){
                    difmax = priv_difmax;
                }
            }
        }

    }
    tstop = omp_get_wtime ();

    // print results

    printf("iter = %d  difmax = %9.11lf", iter, difmax);

    for (i=0; i <= m+1; i++) {
        printf("\n");
        for (j=0; j <= n+1; j++) {
            printf("%3.5lf ", t[i][j]);
        }
    }

    printf("\n");
    tstop = omp_get_wtime ();

    printf("time taken is %4.3lf\n", (tstop-tstart));
    printf("\n");
}

Answer 1

除了以下代碼外，我看不到明顯的問題：

#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;
    #pragma omp for schedule(static)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
                if (diff > priv_difmax) {
                priv_difmax = diff;
            }
                //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
        #pragma omp critical 
        if (priv_difmax > difmax){
            difmax = priv_difmax;
        }
    }
}

減少部分，將priv_difmax復制到difmax ，應該移出循環，以便線程僅通過critical部分一次，而不是在外循環的每次迭代中都通過。

#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;
    #pragma omp for schedule(static) nowait //no need to wait after the loop
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
                if (diff > priv_difmax) {
                priv_difmax = diff;
            }
                //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
    }
    // Finish the loop first, then update difmax
    #pragma omp critical 
    if (priv_difmax > difmax){
        difmax = priv_difmax;
    }
} //Implicit barrier

現在，並行化具有開銷成本，並且僅對於較大的m和n值，可以預期加速。 您正在考慮的問題可能太小。 減少開銷的方法是合並兩個parallel結構，這樣就不必產生兩次線程池。 甚至更好的是，將while循環放入parallel構造中，這樣我們只需要在每次迭代時同步現有線程，而不必創建和銷毀它們：

difmax=1000000.0;
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
while (difmax > tol) {

    // have one thread reset difmax and increment iter
    #pragma omp single nowait
    iter++,difmax=0.0;

    // loop to update tnew - distributed among threads
    #pragma omp parallel for schedule(static) \
    default(shared) private(i,j)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
        }
    } //implicit barrier here

    // each thread resets its private difmax
    priv_difmax=0.0;

    // loop to compute difmax - distributed among threads
    #pragma omp for schedule(static) nowait
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);

            if (diff > priv_difmax) {
                priv_difmax = diff;
            }

            //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
    }

    // each thread now updates difmax if needed, one at a time
    #pragma omp critical 
    if (priv_difmax > difmax){
        difmax = priv_difmax;
    }

    // put a barrier here to make sure that diffmax have been updated 
    // before any thread tests the condition for next iteration of the
    // while-loop condition
    #pragma omp barrier
}

比較代碼以串行和並行方式運行的最好方法是在有和沒有OpenMP支持的情況下進行編譯（例如，使用gcc，使用和不使用-fopenmp編譯器和鏈接器標志進行編譯）。 這將有助於指出問題出在實際上是並行化還是原始串行代碼與“並行就緒”版本之間的其他修改。

這個想法是要知道從原始的串行代碼到並行代碼（沒有並行支持而編譯）到並行代碼（與OpenMP編譯）之間浪費了時間。

需要使用一些預處理頭，因為在沒有OpenMP支持的情況下，編譯器將無法識別omp_get_thread_num()類的函數。 omp_get_wtime()也不應使用； 由於所有時間釋放都是在並行區域之外完成的，因此無需使用該特定函數，並且對time()的調用將是准確的（這需要#include <time.h> ）。

// This part is necessary for the code to run whether it is compiled or not with OpenMP
#ifdef _OPENMP
   #include <omp.h>
#else
   # ifndef _ESCAPE_OMPENMP
      #define omp_get_num_threads() 1
      #define omp_get_thread_num() 0
      #define omp_get_max_threads() 0
      #define _ESCAPE_OMPENMP
   #endif
#endif

openmp串行比並行快

問題描述

1 個解決方案

解決方案1
0 2018-12-10 09:35:45

openmp串行比並行快

問題描述

1 個解決方案

解決方案1 0 2018-12-10 09:35:45

解決方案1
0 2018-12-10 09:35:45