简体   繁体   中英

why my OpenMP C++ code is slower than a serial code?

#include <iostream>
#include <iomanip>
#include <fstream> 
#include <sstream>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
using namespace std;


void output(float a[], float X[], float Y[], int I, int J)
{
  ofstream ft;
  int i;

  ft.open("flow.dat");
  ft<<"variables=\"x\",\"y\",\"a\""<<"\n"
    <<"zone f=point"<<"\n"
    <<"I="<<I<<",J="<<J<<"\n"
    <<endl;

  for(int i=0;i<I*J;i++)
    {
    ft<<setiosflags(ios::scientific)
      <<X[i]<<" "<<Y[i]<<" "<<a[i]<<endl;
    }

  ft.close();

}

void set(float a[], float X[], float Y[], int I, int J, float hx, float hy)
{
  for(int j=0;j<J;j++)
    for(int i=0;i<I;i++)
      {
        int iC=j*I+i;
        X[iC]=i*hx;
        Y[iC]=j*hy;
        a[iC]=0.0;
        if(j==J-1) a[iC]=1.0;
      }
}

void difference_serial(float a[],  int I, int J, const float hx, const float hy)
{
  const float aC=(hx*hx+hy*hy)*2;
  const float aX=hy*hy;
  const float aY=hx*hx;
  for(int j=1;j<J-1;j++)
    for(int i=1;i<I-1;i++)
      {
        int iC=j*I+i;
        int iL=iC-1;
        int iR=iC+1;
        int iU=iC+I;
        int iD=iC-I;
        a[iC]=(aX*(a[iL]+a[iR])+aY*(a[iU]+a[iD]))/aC;
      }


}

void difference_omp(float a[],  int I, int J, const float hx, const float hy)
{
  const float aC=(hx*hx+hy*hy)*2;
  const float aX=hy*hy;
  const float aY=hx*hx;

  int i,j,iC,iL,iR,iU,iD;
#pragma omp parallel for private(i,j,iC,iL,iR,iU,iD) shared(a,I,J) schedule(dynamic) 
  for( j=1;j<J-1;j++)
    for( i=1;i<I-1;i++)
      {
        iC=j*I+i;
        iL=iC-1;
        iR=iC+1;
        iU=iC+I;
        iD=iC-I;
        a[iC]=(aX*(a[iL]+a[iR])+aY*(a[iU]+a[iD]))/aC;
      }
}

int main()
{
  const int I=129;
  const int J=129;
  const int N=I*J;
  const float hx=1.0/(I-1);
  const float hy=1.0/(J-1);

  float *a=new float[N];
  float *X=new float[N];
  float *Y=new float[N];

  //set the grid and flow
  set(a,X,Y,I,J,hx,hy);

  //iteation
  clock_t start=clock();
  for(int it=0;it<10000;it++)
    difference_serial(a,I,J,hx,hy);
  clock_t end=clock();
  printf("Serial time=%f\n",(float)(end-start)/CLOCKS_PER_SEC);


  set(a,X,Y,I,J,hx,hy);
  clock_t start2=clock();
  for(int it2=0;it2<10000;it2++)
    difference_omp(a,I,J,hx,hy);
  clock_t end2=clock();
  printf("Omp time=%f\n",(float)(end2-start2)/CLOCKS_PER_SEC);

  //output
  output(a,X,Y,I,J);

  //free memory
  delete[] a;
  delete[] X;
  delete[] Y;
}

I write a piece of code to solve a very simple Laplace equation in two dimensions. Try to compare the serial code and OpenMP code

I tried to compile the code with g++ tmp.cpp -fopenmp

and get the very strange result output: Serial time=1.620000 Omp time=9.820000

Is there anyone can help me to figure out what's the reason behind this and how to correct the OpenMP code.

I ran into funny results.

luk32:~/projects/tests$ g++ -fopenmp -lgomp ./laplace.cpp 
luk32:~/projects/tests$ ./a.out 
Omp time=13.000000
Serial time=3.000000
luk32:~/projects/tests$ g++ -O3 -fopenmp -lgomp ./laplace.cpp 
luk32:~/projects/tests$ ./a.out 
Omp time=31.000000
Serial time=1.000000

So with O3 the time worsened for OpenMP and dropped forthe serial version. My guess is that the problem instance is so small that the actual overhead from invoking parallel region is manifesting here.

You are trying to parallelize something that is taking 1.5s / 10k = 0.15 millisecond on your PC. Initalizing thread pool and scheduling has its overhead especially with schedule(dynamic)

I will try to do some testing to confirm. Not sure if it is legal to randomly bump I and J .

After tests:

OK I switched J=I=10240; and set up for(int it=0;it<50;it++) . I also used omp_get_wtime() for time measurments. Below is full diff file.

Here are the results:

Serial time=58.982189
Omp time=9.158118

It was perfromed on a 6-phys/12-logical core machine. Now the results are as expected. Your example problem was way too small for OpenMP to be efficient up to the point the overhead took longer than the calculations.

Diff:

luk32:~/projects/tests$ diff laplace.orig.cpp laplace.cpp
88,89c88,89
<   const int I=129;
<   const int J=129;
---
>   const int I=10000;
>   const int J=10000;
102,103c102,103
<   clock_t start=clock();
<   for(int it=0;it<10000;it++)
---
>   double start=omp_get_wtime();
>   for(int it=0;it<50;it++)
105,106c105,106
<   clock_t end=clock();
<   printf("Serial time=%f\n",(float)(end-start)/CLOCKS_PER_SEC);
---
>   double end=omp_get_wtime();
>   printf("Serial time=%f\n",(float)(end-start));
110,111c110,111
<   clock_t start2=clock();
<   for(int it2=0;it2<10000;it2++)
---
>   double start2=omp_get_wtime();
>   for(int it2=0;it2<50;it2++)
113,114c113,114
<   clock_t end2=clock();
<   printf("Omp time=%f\n",(float)(end2-start2)/CLOCKS_PER_SEC);
---
>   double end2=omp_get_wtime();
>   printf("Omp time=%f\n",(float)(end2-start2));

EDIT: I just bolded the main problem so anyone who comes across this will focus automatically on it.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM