![](/img/trans.png)
[英]Serial Execution faster than Parallel Execution with one thread of OpenMP
[英]openmp serial faster than parallel
代码已经并行化,但我不知道为什么它比串行版本还慢;当我把线程数增加到 7 到 10 个时,程序反而变得更慢。
我一直在尝试找出问题所在,但对我来说很难定位。
我把 for 循环改成了并行执行,但似乎没有达到预期效果。运行代码时没有出现任何错误。
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdlib.h>
/*
 * Iteratively relaxes a 2-D temperature grid until the largest change
 * between two successive sweeps is no greater than a tolerance, using
 * OpenMP to distribute the sweeps across threads.
 *
 * Command line: <m> <n> <tol>
 *   m, n  interior grid dimensions (each must be >= 1)
 *   tol   convergence tolerance (must be >= 0)
 * The number of threads is read from standard input.
 *
 * Returns 0 on success, 1 when the arguments or the thread count are
 * missing or invalid.
 */
int main(int argc, char *argv[])
{
    int m;
    int n;
    double tol;
    double tstart, tstop;
    int i, j, iter, nthreads;

    /* argv[1..3] are dereferenced below, so they have to be present. */
    if (argc < 4) {
        fprintf(stderr, "usage: %s m n tol\n", argc > 0 ? argv[0] : "prog");
        return 1;
    }
    m = atoi(argv[1]);
    n = atoi(argv[2]);
    tol = atof(argv[3]);

    /* m and n size the variable-length arrays below; a non-positive
     * extent would be undefined behaviour. */
    if (m < 1 || n < 1) {
        fprintf(stderr, "m and n must be positive integers\n");
        return 1;
    }
    /* difmax is never negative, so a negative tolerance can never be met. */
    if (tol < 0.0) {
        fprintf(stderr, "tol must be non-negative\n");
        return 1;
    }

    double t[m+2][n+2], tnew[m+1][n+1], diff, difmax, priv_difmax;

    printf("%d %d %lf\n",m,n,tol);
    printf("Enter the number of threads (max 10) ");
    /* Reject unreadable or non-positive thread counts instead of passing
     * an indeterminate value to omp_set_num_threads(). */
    if (scanf("%d",&nthreads) != 1 || nthreads < 1) {
        fprintf(stderr, "invalid number of threads\n");
        return 1;
    }
    omp_set_num_threads(nthreads);
    tstart = omp_get_wtime ();

    /* Initialise the whole grid, halo included. */
    #pragma omp parallel for schedule(static) default(shared) private(i,j)
    for (i=0; i <= m+1; i++) {
        for (j=0; j <= n+1; j++) {
            t[i][j] = 30.0;
        }
    }

    /* Fix the boundary conditions on the four edges. */
    for (i=1; i <= m; i++) {
        t[i][0] = 20.0;
        t[i][n+1] = 100.0;
    }
    for (j=1; j <= n; j++) {
        t[0][j] = 10.0;
        t[m+1][j] = 140.0;
    }

    /* Main loop: sweep until the largest change drops to tol or below. */
    iter = 0;
    difmax = 1000000.0;
    while (difmax > tol) {
        iter++;

        /* New value of each interior cell = mean of its four neighbours. */
        #pragma omp parallel for schedule(static) default(shared) private(i,j)
        for (i=1; i <= m; i++) {
            for (j=1; j <= n; j++) {
                tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
            }
        }

        /* Largest difference between old and new temperatures. */
        difmax = 0.0;
        #pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
        {
            priv_difmax = 0.0;
            /* nowait: a thread may go straight on to merge its maximum;
             * the end of the parallel region is the barrier. */
            #pragma omp for schedule(static) nowait
            for (i=1; i <= m; i++) {
                for (j=1; j <= n; j++) {
                    diff = fabs(tnew[i][j]-t[i][j]);
                    if (diff > priv_difmax) {
                        priv_difmax = diff;
                    }
                    /* Copy new to old temperatures. */
                    t[i][j] = tnew[i][j];
                }
            }
            /* Merge once per thread, after the loop, rather than once per
             * row: taking the lock inside the loop serialises the threads. */
            #pragma omp critical
            {
                if (priv_difmax > difmax) {
                    difmax = priv_difmax;
                }
            }
        }
    }
    /* Stop the clock before printing so output time is not measured. */
    tstop = omp_get_wtime ();

    /* Print results. */
    printf("iter = %d difmax = %9.11lf", iter, difmax);
    for (i=0; i <= m+1; i++) {
        printf("\n");
        for (j=0; j <= n+1; j++) {
            printf("%3.5lf ", t[i][j]);
        }
    }
    printf("\n");
    printf("time taken is %4.3lf\n", (tstop-tstart));
    printf("\n");
    return 0;
}
除了以下代码外,我看不到明显的问题:
// Computes the largest absolute difference between tnew and t over the
// interior cells into the shared difmax, copying tnew into t as it goes.
// Each thread tracks its own maximum in the private priv_difmax.
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
priv_difmax = 0.0;
// The rows (i) are divided among the threads of the team.
#pragma omp for schedule(static)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > priv_difmax) {
priv_difmax = diff;
}
//** copy new to old temperatures**
t[i][j] = tnew[i][j];
}
// NOTE: this critical section is still inside the outer i loop, so a
// thread enters it once for every row it processes.
#pragma omp critical
if (priv_difmax > difmax){
difmax = priv_difmax;
}
}
}
归约部分(即把 priv_difmax 合并到 difmax 的那段 critical 代码)应该移到循环之外,这样每个线程只需进入一次 critical 区域,而不是在外层循环的每次迭代中都进入一次。
// Computes the largest absolute difference between tnew and t over the
// interior cells into the shared difmax, copying tnew into t as it goes.
// Each thread first accumulates its own maximum in the private
// priv_difmax and merges it into difmax only after its share of the loop.
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
priv_difmax = 0.0;
#pragma omp for schedule(static) nowait //no need to wait after the loop
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > priv_difmax) {
priv_difmax = diff;
}
//** copy new to old temperatures**
t[i][j] = tnew[i][j];
}
}
// Finish the loop first, then update difmax
// The critical section lets only one thread at a time compare and update
// the shared difmax.
#pragma omp critical
if (priv_difmax > difmax){
difmax = priv_difmax;
}
} //Implicit barrier
现在,并行化本身是有开销的,只有当 m 和 n 足够大时才能期望获得加速;您测试的问题规模可能太小。减少开销的一种方法是把两个 parallel 结构合并,这样就不必两次创建线程组。更好的做法是把 while 循环放进 parallel 结构内部,这样每次迭代只需同步已有的线程,而不必反复创建和销毁它们:
difmax = 1000000.0;
// A single parallel region encloses the whole iteration: the thread team
// is created once and only synchronised on each pass of the while loop.
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
while (difmax > tol) {
    // Every thread must have evaluated the loop condition before difmax
    // is reset below. Without this barrier a slow thread could read the
    // freshly reset 0.0, leave the loop alone, and leave the remaining
    // threads stuck at the next barrier.
    #pragma omp barrier

    // One thread resets difmax and increments iter. nowait is safe: the
    // barrier ending the first work-shared loop is passed by this thread
    // only after the reset, i.e. before any thread updates difmax.
    #pragma omp single nowait
    {
        iter++;
        difmax = 0.0;
    }

    // Loop to update tnew, distributed among the existing threads. This
    // must be "omp for", not "omp parallel for": the latter would open a
    // nested parallel region in which every thread runs the whole loop.
    #pragma omp for schedule(static)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
        }
    } // implicit barrier here: tnew is complete before it is read below

    // Each thread resets its private maximum.
    priv_difmax = 0.0;

    // Loop to compute the per-thread maximum, distributed among threads.
    #pragma omp for schedule(static) nowait
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
            if (diff > priv_difmax) {
                priv_difmax = diff;
            }
            // Copy new to old temperatures.
            t[i][j] = tnew[i][j];
        }
    }

    // Each thread now updates difmax if needed, one at a time.
    #pragma omp critical
    {
        if (priv_difmax > difmax) {
            difmax = priv_difmax;
        }
    }

    // Make sure difmax has been fully updated before any thread tests
    // the while-loop condition for the next iteration.
    #pragma omp barrier
}
比较代码以串行和并行方式运行的最好方法是在有和没有OpenMP支持的情况下进行编译(例如,使用gcc,使用和不使用-fopenmp编译器和链接器标志进行编译)。 这将有助于指出问题出在实际上是并行化还是原始串行代码与“并行就绪”版本之间的其他修改。
这个想法是要知道从原始的串行代码到并行代码(没有并行支持而编译)到并行代码(与OpenMP编译)之间浪费了时间。
需要加入一些预处理指令,因为在没有 OpenMP 支持的情况下,编译器无法识别 omp_get_thread_num() 这类函数。此时也不应使用 omp_get_wtime();由于所有计时都是在并行区域之外完成的,没有必要使用这个特定函数,调用 time() 就足够准确(这需要 #include <time.h>)。
// Lets the code build and run whether or not it is compiled with OpenMP.
// With OpenMP enabled the compiler defines _OPENMP and the real header is
// used; otherwise the runtime calls below are replaced by the values they
// would have in a serial program (exactly one thread, with id 0).
#ifdef _OPENMP
#include <omp.h>
#else
# ifndef _ESCAPE_OMPENMP
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
// A serial build can use at most one thread, never zero.
#define omp_get_max_threads() 1
// Requesting a thread count has no effect without OpenMP; the argument is
// still evaluated once so that side effects are preserved.
#define omp_set_num_threads(n) ((void)(n))
#define _ESCAPE_OMPENMP
#endif
#endif
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.