
OpenMP serial faster than parallel

The code is parallel, but I don't know why it's slower than my serial version, and when I increase the thread count to around 7 to 10, the program gets even slower.

I've been trying to figure out what the problem is, but it has been difficult for me.

I made the for loop parallel, but it seems like it's not working. I am not receiving any errors when I run my code.

#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdlib.h>


int main(int argc, char *argv[])
{

    int m; 
    int n;
    double tol;// = 0.0001;
    double tstart, tstop;

    int i, j, iter, nthreads;



    m = atoi(argv[1]);
    n = atoi(argv[2]);
    tol = atof(argv[3]);

    double t[m+2][n+2], tnew[m+1][n+1], diff, difmax, priv_difmax;

    printf("%d %d %lf\n",m,n,tol);
    printf("Enter the number of threads (max 10) ");
    scanf("%d",&nthreads);

    omp_set_num_threads(nthreads);
    tstart = omp_get_wtime ();

    //** initialise temperature array*


    #pragma omp parallel for schedule(static)\
    default(shared) private(i,j)
    for (i=0; i <= m+1; i++) {
        for (j=0; j <= n+1; j++) {
            t[i][j] = 30.0;
        }
    }

    //*** fix boundary conditions***


    for (i=1; i <= m; i++) {
        t[i][0] = 20.0;
        t[i][n+1] = 100.0;
    }
    for (j=1; j <= n; j++) {
        t[0][j] = 10.0;
        t[m+1][j] = 140.0;
    }


    //** main loop**


    iter = 0;
    difmax = 1000000.0;
    while (difmax > tol) {
        iter++;

        // **update temperature for next iteration**


        #pragma omp parallel for schedule(static) \
        default(shared) private(i,j)
        for (i=1; i <= m; i++) {
            for (j=1; j <= n; j++) {
                tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
            }
        }

        // **work out maximum difference between old and new temperatures**

        difmax = 0.0;

        #pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
        {
            priv_difmax = 0.0;
            #pragma omp for schedule(static)
            for (i=1; i <= m; i++) {
                for (j=1; j <= n; j++) {
                    diff = fabs(tnew[i][j]-t[i][j]);

                    if (diff > priv_difmax) {
                        priv_difmax = diff;
                    }

                    //** copy new to old temperatures**
                    t[i][j] = tnew[i][j];
                }
                #pragma omp critical 
                if (priv_difmax > difmax){
                    difmax = priv_difmax;
                }
            }
        }

    }
    tstop = omp_get_wtime ();

    // print results

    printf("iter = %d  difmax = %9.11lf", iter, difmax);

    for (i=0; i <= m+1; i++) {
        printf("\n");
        for (j=0; j <= n+1; j++) {
            printf("%3.5lf ", t[i][j]);
        }
    }

    printf("\n");

    printf("time taken is %4.3lf\n", (tstop-tstart));
    printf("\n");
}

I do not see obvious issues except perhaps in the following code:

#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;
    #pragma omp for schedule(static)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
            if (diff > priv_difmax) {
                priv_difmax = diff;
            }
            //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
        #pragma omp critical 
        if (priv_difmax > difmax){
            difmax = priv_difmax;
        }
    }
}

The reduction part, copying priv_difmax to difmax, should be moved out of the loop so that threads pass through the critical section only once, rather than at each iteration of the outer loop.

#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;
    #pragma omp for schedule(static) nowait //no need to wait after the loop
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
            if (diff > priv_difmax) {
                priv_difmax = diff;
            }
            //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
    }
    // Finish the loop first, then update difmax
    #pragma omp critical 
    if (priv_difmax > difmax){
        difmax = priv_difmax;
    }
} //Implicit barrier
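
As an aside, if your compiler supports OpenMP 3.1 or later, a max reduction clause expresses the same pattern without any explicit critical section. This is a sketch of an alternative, not the approach taken above:

difmax = 0.0;
#pragma omp parallel for schedule(static) \
default(shared) private(i, j, diff) reduction(max:difmax)
for (i=1; i <= m; i++) {
    for (j=1; j <= n; j++) {
        diff = fabs(tnew[i][j]-t[i][j]);
        if (diff > difmax) {
            difmax = diff;
        }
        //** copy new to old temperatures**
        t[i][j] = tnew[i][j];
    }
}

Each thread works on a private copy of difmax initialised for the max operation, and the runtime combines all copies when the loop ends.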

Now, parallelization has an overhead cost, and speedup may be expected only for large values of m and n; the problem you are considering may simply be too small. One way to reduce the overhead would be to merge the two parallel constructs so that the pool of threads does not have to be spawned twice. Or, even better, put the while loop inside the parallel construct, so that existing threads only have to be synchronized at each iteration, rather than created and destroyed:

difmax=1000000.0;
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
while (difmax > tol) {

    // make sure all threads have tested the while condition before one
    // of them resets difmax below
    #pragma omp barrier

    // have one thread reset difmax and increment iter
    #pragma omp single nowait
    {
        iter++;
        difmax = 0.0;
    }

    // loop to update tnew - distributed among threads
    // we are already inside a parallel region, so use a plain "omp for"
    // here (a nested "parallel for" would try to spawn a new team of threads)
    #pragma omp for schedule(static)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
        }
    } //implicit barrier here

    // each thread resets its private difmax
    priv_difmax=0.0;

    // loop to compute difmax - distributed among threads
    #pragma omp for schedule(static) nowait
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);

            if (diff > priv_difmax) {
                priv_difmax = diff;
            }

            //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
    }

    // each thread now updates difmax if needed, one at a time
    #pragma omp critical 
    if (priv_difmax > difmax){
        difmax = priv_difmax;
    }

    // put a barrier here to make sure that difmax has been updated
    // before any thread tests the while condition for the next iteration
    #pragma omp barrier
}

The best way to compare how the code runs in serial and in parallel is to compile it with and without OpenMP support (e.g. with gcc, compile with and without the -fopenmp compiler and linker flag). This will help point out whether the issue actually lies with parallelization, or with other modifications made between the original serial code and the "parallel-ready" version.

The idea is to see where time is lost when going from the original serial code, to the parallel code compiled without OpenMP support, to the parallel code compiled with OpenMP.
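
For instance, with gcc (assuming the source file is named heat.c; the name is only for illustration):

# "heat.c" is a placeholder for your source file
gcc -O2 -o heat_serial heat.c -lm              # without OpenMP: pragmas are ignored
gcc -O2 -fopenmp -o heat_parallel heat.c -lm   # with OpenMP

(-lm links the math library for fabs().)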

Some preprocessor logic needs to be used, because the compiler will not recognize functions like omp_get_thread_num() without OpenMP support. omp_get_wtime() should not be used either; since all your time measurements are done outside of parallel regions, there is no need for that specific function, and a call to time() will be accurate enough (this requires #include <time.h>).

// This part is necessary for the code to run whether or not it is compiled with OpenMP
#ifdef _OPENMP
   #include <omp.h>
#else
   #ifndef _ESCAPE_OPENMP
      #define omp_get_num_threads() 1
      #define omp_get_thread_num() 0
      #define omp_get_max_threads() 1
      #define _ESCAPE_OPENMP
   #endif
#endif
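
Along the same lines, a fallback macro can cover timing too, if you would rather keep the omp_get_wtime() calls in place. This is a sketch following the suggestion above, not part of the original answer:

#ifdef _OPENMP
   #include <omp.h>
#else
   #include <time.h>
   // serial fallback: time() has one-second resolution, which is fine
   // for coarse timings taken outside parallel regions
   #define omp_get_wtime() ((double) time(NULL))
#endif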
