为什么我的OpenMP C ++代码比串行代码慢？

Question

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <chrono>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <omp.h>
using namespace std;


// Writes the grid and the field to the text file "flow.dat" in the current
// working directory.
//
// The file starts with a three-line header (variable names, "zone f=point",
// and the I/J extents) followed by one blank line, then one row per grid
// point: "x y a" in scientific notation. The header looks like the Tecplot
// ASCII point format -- NOTE(review): confirm against the consumer of the file.
//
// a, X, Y : arrays holding at least I*J values each (field, x and y coords).
// I, J    : grid extents; I*J rows are written in storage order.
//
// If the file cannot be opened, a message is printed to stderr and nothing
// is written.
void output(float a[], float X[], float Y[], int I, int J)
{
  std::ofstream ft("flow.dat");
  if(!ft)
    {
      std::cerr<<"output: cannot open flow.dat for writing\n";
      return;
    }

  ft<<"variables=\"x\",\"y\",\"a\""<<"\n"
    <<"zone f=point"<<"\n"
    <<"I="<<I<<",J="<<J<<"\n"
    <<"\n";

  // The format flag is sticky, so set it once instead of once per row.
  ft<<std::setiosflags(std::ios::scientific);

  // '\n' instead of endl: endl would flush the stream after every row.
  const int count=I*J;
  for(int k=0;k<count;k++)
    ft<<X[k]<<" "<<Y[k]<<" "<<a[k]<<"\n";

  // ft is flushed and closed by its destructor.
}

// Initialises the grid coordinates and the starting field.
//
// Points are stored row by row: point (i,j) lives at index j*I+i.
// X receives i*hx, Y receives j*hy, and a is 0 everywhere except on the
// last row (j == J-1), where it is 1.
void set(float a[], float X[], float Y[], int I, int J, float hx, float hy)
{
  const int lastRow=J-1;
  int idx=0;                       // running index, equals row*I+col
  for(int row=0;row<J;++row)
    {
      const float y=row*hy;
      const float value=(row==lastRow)?1.0f:0.0f;
      for(int col=0;col<I;++col,++idx)
        {
          X[idx]=col*hx;
          Y[idx]=y;
          a[idx]=value;
        }
    }
}

// One in-place relaxation sweep of the five-point stencil over the interior
// of an I x J grid stored row by row (point (i,j) at index j*I+i).
//
// Each interior cell is replaced by
//   (hy^2*(left+right) + hx^2*(up+down)) / (2*(hx^2+hy^2)).
// Cells are visited in storage order and updated in place, so a cell sees the
// already-updated values of its left and lower neighbours. Boundary cells are
// never written.
void difference_serial(float a[],  int I, int J, const float hx, const float hy)
{
  const float wHoriz=hy*hy;
  const float wVert=hx*hx;
  const float denom=(hx*hx+hy*hy)*2;

  for(int row=1;row<J-1;++row)
    {
      const int rowStart=row*I;
      for(int col=1;col<I-1;++col)
        {
          const int c=rowStart+col;
          const float horiz=a[c-1]+a[c+1];
          const float vert=a[c+I]+a[c-I];
          a[c]=(wHoriz*horiz+wVert*vert)/denom;
        }
    }
}

// One in-place relaxation sweep of the five-point stencil over the interior
// of an I x J grid stored row by row (point (i,j) at index j*I+i), written so
// that it can be run by several OpenMP threads.
//
// Each interior cell is replaced by
//   (hy^2*(left+right) + hx^2*(up+down)) / (2*(hx^2+hy^2)).
// Boundary cells are never written.
//
// The sweep uses a red-black ordering: first every interior cell with (i+j)
// even is updated, then every cell with (i+j) odd. All four neighbours of a
// cell have the opposite parity, so within one colour pass no thread reads a
// value that another thread is writing. Splitting the plain in-place sweep
// over rows would not have that property: a row reads rows j-1 and j+1 while
// other threads may be updating them, which is a data race and makes the
// result depend on thread timing.
//
// The result is the same for any number of threads (and when the pragmas are
// ignored). The per-sweep values differ from difference_serial, which visits
// cells in storage order, but both sweeps leave the same fields unchanged.
//
// NOTE(review): for very small grids the cost of starting the parallel region
// can still exceed the work of one sweep; measure before relying on a speedup.
void difference_omp(float a[],  int I, int J, const float hx, const float hy)
{
  const float aC=(hx*hx+hy*hy)*2;
  const float aX=hy*hy;
  const float aY=hx*hx;

  // One parallel region for both colour passes; variables declared inside it
  // are private to each thread.
#pragma omp parallel
  {
    for(int color=0;color<2;color++)
      {
        // Rows all cost the same, so a static split avoids the per-row
        // hand-out that schedule(dynamic) (default chunk size 1) performs.
        // The implicit barrier at the end of the loop separates the passes.
#pragma omp for schedule(static)
        for(int j=1;j<J-1;j++)
          {
            const int rowStart=j*I;
            // First interior column in this row with (i+j)%2==color.
            const int firstI=1+((j+1+color)&1);
            for(int i=firstI;i<I-1;i+=2)
              {
                const int iC=rowStart+i;
                const int iL=iC-1;
                const int iR=iC+1;
                const int iU=iC+I;
                const int iD=iC-I;
                a[iC]=(aX*(a[iL]+a[iR])+aY*(a[iU]+a[iD]))/aC;
              }
          }
      }
  }
}

int main()
{
  const int I=129;
  const int J=129;
  const int N=I*J;
  const float hx=1.0/(I-1);
  const float hy=1.0/(J-1);

  float *a=new float[N];
  float *X=new float[N];
  float *Y=new float[N];

  //set the grid and flow
  set(a,X,Y,I,J,hx,hy);

  //iteation
  clock_t start=clock();
  for(int it=0;it<10000;it++)
    difference_serial(a,I,J,hx,hy);
  clock_t end=clock();
  printf("Serial time=%f\n",(float)(end-start)/CLOCKS_PER_SEC);


  set(a,X,Y,I,J,hx,hy);
  clock_t start2=clock();
  for(int it2=0;it2<10000;it2++)
    difference_omp(a,I,J,hx,hy);
  clock_t end2=clock();
  printf("Omp time=%f\n",(float)(end2-start2)/CLOCKS_PER_SEC);

  //output
  output(a,X,Y,I,J);

  //free memory
  delete[] a;
  delete[] X;
  delete[] Y;
}

I wrote a piece of code to solve a very simple Laplace equation in two dimensions. 我编写了一段代码，用于求解一个非常简单的二维Laplace方程。 I am trying to compare the serial code with the OpenMP code. 我想比较串行代码和OpenMP代码。

I compiled the code with g++ tmp.cpp -fopenmp 我用 g++ tmp.cpp -fopenmp 编译了代码

and got a very strange result: Serial time=1.620000 Omp time=9.820000 并得到了非常奇怪的结果：Serial time=1.620000 Omp time=9.820000

Can anyone help me figure out the reason behind this and how to correct the OpenMP code? 有没有人可以帮助我弄清楚这背后的原因，以及如何更正OpenMP代码？

Answer 1

I ran into funny results. 我遇到了有趣的结果。

luk32:~/projects/tests$ g++ -fopenmp -lgomp ./laplace.cpp 
luk32:~/projects/tests$ ./a.out 
Omp time=13.000000
Serial time=3.000000
luk32:~/projects/tests$ g++ -O3 -fopenmp -lgomp ./laplace.cpp 
luk32:~/projects/tests$ ./a.out 
Omp time=31.000000
Serial time=1.000000

So with -O3 the time worsened for OpenMP and dropped for the serial version. 因此，使用-O3时，OpenMP版本的时间变得更糟，而串行版本的时间却减少了。 My guess is that the problem instance is so small that the overhead of invoking the parallel region dominates here. 我的猜测是问题规模太小，以至于调用并行区域的开销在这里占了主导。

You are trying to parallelize something that takes 1.5 s / 10k = 0.15 milliseconds per call on your PC. 您正在尝试并行化的内容在您的PC上每次调用只耗时1.5 s / 10k = 0.15毫秒。 Initializing the thread pool and scheduling have their overhead, especially with schedule(dynamic). 初始化线程池和调度都有其开销，尤其是在使用schedule(dynamic)时。

I will try to do some testing to confirm. 我将尝试进行一些测试以确认。 I am not sure whether it is valid to arbitrarily increase I and J. 不确定随意增大I和J是否合理。

After tests: 经过测试：

OK, I switched to J=I=10240 (note: the diff below shows I=J=10000) OK，我改用了J=I=10240（注意：下面的差异文件中显示的是I=J=10000） and set for(int it=0;it<50;it++). 并设置for(int it=0;it<50;it++)。 I also used omp_get_wtime() for time measurements. 我还使用omp_get_wtime()进行时间测量。 Below is the full diff. 以下是完整的差异文件。

Here are the results: 结果如下：

Serial time=58.982189
Omp time=9.158118

It was performed on a machine with 6 physical / 12 logical cores. 它是在一台6物理核/12逻辑核的计算机上执行的。 Now the results are as expected. 现在结果与预期的一样。 Your example problem was far too small for OpenMP to be efficient, to the point that the overhead took longer than the calculations. 您的示例问题对于OpenMP来说太小了，无法高效运行，以至于开销花费的时间比计算本身还长。

Diff: 差异：

luk32:~/projects/tests$ diff laplace.orig.cpp laplace.cpp
88,89c88,89
<   const int I=129;
<   const int J=129;
---
>   const int I=10000;
>   const int J=10000;
102,103c102,103
<   clock_t start=clock();
<   for(int it=0;it<10000;it++)
---
>   double start=omp_get_wtime();
>   for(int it=0;it<50;it++)
105,106c105,106
<   clock_t end=clock();
<   printf("Serial time=%f\n",(float)(end-start)/CLOCKS_PER_SEC);
---
>   double end=omp_get_wtime();
>   printf("Serial time=%f\n",(float)(end-start));
110,111c110,111
<   clock_t start2=clock();
<   for(int it2=0;it2<10000;it2++)
---
>   double start2=omp_get_wtime();
>   for(int it2=0;it2<50;it2++)
113,114c113,114
<   clock_t end2=clock();
<   printf("Omp time=%f\n",(float)(end2-start2)/CLOCKS_PER_SEC);
---
>   double end2=omp_get_wtime();
>   printf("Omp time=%f\n",(float)(end2-start2));

EDIT: I just bolded the main problem so anyone who comes across this will focus automatically on it. 编辑：我只是加粗了主要问题，所以遇到这个问题的任何人都会自动关注它。

为什么我的OpenMP C ++代码比串行代码慢？

问题描述

1 个解决方案

解决方案1
7 2013-03-04 16:56:57

为什么我的OpenMP C ++代码比串行代码慢？

问题描述

1 个解决方案

解决方案1 7 2013-03-04 16:56:57

解决方案1
7 2013-03-04 16:56:57