
openmp serial faster than parallel

The code is parallelised, but I don't know why it is slower than my serial version, and the program also gets slower when I increase the number of threads to 7 to 10.

I have been trying to figure out what the problem is, but it is proving difficult for me.

I made the for loops parallel, but it doesn't seem to work properly. I don't get any errors when I run the code.

#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdlib.h>


int main(int argc, char *argv[])
{

    int m; 
    int n;
    double tol;// = 0.0001;
    double tstart, tstop;

    int i, j, iter, nthreads;



    m = atoi(argv[1]);
    n = atoi(argv[2]);
    tol = atof(argv[3]);

    double t[m+2][n+2], tnew[m+1][n+1], diff, difmax,priv_difmax;

    printf("%d %d %lf\n",m,n,tol);
    printf("Enter the number of threads (max 10) ");
    scanf("%d",&nthreads);

    omp_set_num_threads(nthreads);
    tstart = omp_get_wtime ();

    //** initialise temperature array*


    #pragma omp parallel for schedule(static)\
    default(shared) private(i,j)
    for (i=0; i <= m+1; i++) {
        for (j=0; j <= n+1; j++) {
            t[i][j] = 30.0;
        }
    }

    //*** fix boundary conditions***


    for (i=1; i <= m; i++) {
        t[i][0] = 20.0;
        t[i][n+1] = 100.0;
    }
    for (j=1; j <= n; j++) {
        t[0][j] = 10.0;
        t[m+1][j] = 140.0;
    }


    //** main loop**


    iter = 0;
    difmax = 1000000.0;
    while (difmax > tol) {
        iter++;

        // **update temperature for next iteration**


        #pragma omp parallel for schedule(static) \
        default(shared) private(i,j)
        for (i=1; i <= m; i++) {
            for (j=1; j <= n; j++) {
                tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
            }
        }

        // **work out maximum difference between old and new temperatures**

        difmax = 0.0;

        #pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
        {
            priv_difmax = 0.0;
            #pragma omp for schedule(static)
            for (i=1; i <= m; i++) {
                for (j=1; j <= n; j++) {
                    diff = fabs(tnew[i][j]-t[i][j]);

                    if (diff > priv_difmax) {
                        priv_difmax = diff;
                    }

                    //** copy new to old temperatures**
                    t[i][j] = tnew[i][j];
                }
                #pragma omp critical 
                if (priv_difmax > difmax){
                    difmax = priv_difmax;
                }
            }
        }

    }
    tstop = omp_get_wtime ();

    // print results

    printf("iter = %d  difmax = %9.11lf", iter, difmax);

    for (i=0; i <= m+1; i++) {
        printf("\n");
        for (j=0; j <= n+1; j++) {
            printf("%3.5lf ", t[i][j]);
        }
    }

    printf("\n");
    tstop = omp_get_wtime ();

    printf("time taken is %4.3lf\n", (tstop-tstart));
    printf("\n");
}

I can't see an obvious problem, except perhaps in the following code:

#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;
    #pragma omp for schedule(static)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
                if (diff > priv_difmax) {
                priv_difmax = diff;
            }
                //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
        #pragma omp critical 
        if (priv_difmax > difmax){
            difmax = priv_difmax;
        }
    }
}

The reduction part, where priv_difmax is copied into difmax, should be moved out of the loop, so that each thread goes through the critical section only once rather than on every iteration of the outer loop.

#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;
    #pragma omp for schedule(static) nowait //no need to wait after the loop
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
                if (diff > priv_difmax) {
                priv_difmax = diff;
            }
                //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
    }
    // Finish the loop first, then update difmax
    #pragma omp critical 
    if (priv_difmax > difmax){
        difmax = priv_difmax;
    }
} //Implicit barrier
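
As an aside, if the compiler supports OpenMP 3.1 or newer (an assumption, not something the posted code requires), the hand-written critical section can be dropped entirely by letting OpenMP perform the max reduction itself. A minimal sketch of the same loop written that way:

difmax = 0.0;
#pragma omp parallel for schedule(static) \
        default(shared) private(i,j,diff) reduction(max:difmax)
for (i=1; i <= m; i++) {
    for (j=1; j <= n; j++) {
        diff = fabs(tnew[i][j]-t[i][j]);
        if (diff > difmax) {
            difmax = diff;      // updates this thread's private copy
        }
        //** copy new to old temperatures**
        t[i][j] = tnew[i][j];
    }
}
// the private copies of difmax are combined with max when the loop ends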

Now, parallelisation comes with an overhead cost, and a speedup can only be expected for large enough values of m and n; the problem you are looking at may simply be too small. One way to reduce the overhead is to merge the two parallel constructs, so that the pool of threads does not have to be spawned twice. Better still, move the while loop inside the parallel construct, so that at each iteration we only have to synchronise the existing threads instead of creating and destroying them:

difmax=1000000.0;
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
while (difmax > tol) {

    // make sure every thread has tested the while condition with the
    // current difmax before one thread resets it below
    #pragma omp barrier

    // have one thread reset difmax and increment iter
    #pragma omp single nowait
    iter++,difmax=0.0;

    // loop to update tnew - distributed among threads
    #pragma omp for schedule(static)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
        }
    } //implicit barrier here

    // each thread resets its private difmax
    priv_difmax=0.0;

    // loop to compute difmax - distributed among threads
    #pragma omp for schedule(static) nowait
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);

            if (diff > priv_difmax) {
                priv_difmax = diff;
            }

            //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
    }

    // each thread now updates difmax if needed, one at a time
    #pragma omp critical 
    if (priv_difmax > difmax){
        difmax = priv_difmax;
    }

    // put a barrier here to make sure that difmax has been updated
    // before any thread tests the while-loop condition for the next
    // iteration
    #pragma omp barrier
}

The best way to compare how the code runs serially and in parallel is to compile it with and without OpenMP support (e.g. with gcc, build with and without the -fopenmp compiler and linker flag). This helps pin down whether the problem really lies in the parallelisation itself, or in other modifications made between the original serial code and the "parallel-ready" version.

The idea is to find out where the time is lost going from the original serial code, to the parallel code compiled without OpenMP support, to the parallel code compiled with OpenMP.

Some preprocessor directives are needed, because without OpenMP support the compiler will not recognise functions such as omp_get_thread_num(). omp_get_wtime() should not be used either: since all the timing is done outside the parallel regions, there is no need for that particular function, and a call to time() will be accurate (this requires #include <time.h>).

// This part is necessary for the code to build and run whether or not it is compiled with OpenMP
#ifdef _OPENMP
   #include <omp.h>
#else
   # ifndef _ESCAPE_OPENMP
      #define omp_get_num_threads() 1
      #define omp_get_thread_num() 0
      #define omp_get_max_threads() 1
      #define _ESCAPE_OPENMP
   #endif
#endif
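
For the timing itself, the two omp_get_wtime() calls in the posted code could then be replaced with time() roughly as follows (a sketch only: tstart and tstop stay double so the existing printf format still works, and time() counts whole seconds):

#include <time.h>

// ...
tstart = (double) time(NULL);   // instead of omp_get_wtime()
// ... initialisation and main loop ...
tstop  = (double) time(NULL);
printf("time taken is %4.3lf\n", (tstop-tstart));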
