![](/img/trans.png)
[英]Serial Execution faster than Parallel Execution with one thread of OpenMP
[英]openmp serial faster than parallel
代碼已經並行化,但我不知道為什麼它比串行版本還要慢;當我把線程數增加到 7 至 10 時,程序甚至變得更慢。
我一直在試圖找出問題所在,但對我來說卻很難
我將for循環設為並行,但似乎無法正常工作。 運行代碼時,我沒有收到任何錯誤。
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdlib.h>
int main(int argc, char *argv[])
{
int m;
int n;
double tol;// = 0.0001;
double tstart, tstop;
int i, j, iter, nthreads;
m = atoi(argv[1]);
n = atoi(argv[2]);
tol = atof(argv[3]);
double t[m+2][n+2], tnew[m+1][n+1], diff, difmax,priv_difmax;
printf("%d %d %lf\n",m,n,tol);
printf("Enter the number of threads (max 10) ");
scanf("%d",&nthreads);
omp_set_num_threads(nthreads);
tstart = omp_get_wtime ();
//** initialise temperature array*
#pragma omp parallel for schedule(static)\
default(shared) private(i,j)
for (i=0; i <= m+1; i++) {
for (j=0; j <= n+1; j++) {
t[i][j] = 30.0;
}
}
//*** fix boundary conditions***
for (i=1; i <= m; i++) {
t[i][0] = 20.0;
t[i][n+1] = 100.0;
}
for (j=1; j <= n; j++) {
t[0][j] = 10.0;
t[m+1][j] = 140.0;
}
//** main loop**
iter = 0;
difmax = 1000000.0;
while (difmax > tol) {
iter++;
// **update temperature for next iteration**
#pragma omp parallel for schedule(static) \
default(shared) private(i,j)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
}
}
// **work out maximum difference between old and new temperatures**
difmax = 0.0;
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
priv_difmax = 0.0;
#pragma omp for schedule(static)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > priv_difmax) {
priv_difmax = diff;
}
//** copy new to old temperatures**
t[i][j] = tnew[i][j];
}
#pragma omp critical
if (priv_difmax > difmax){
difmax = priv_difmax;
}
}
}
}
tstop = omp_get_wtime ();
// print results
printf("iter = %d difmax = %9.11lf", iter, difmax);
for (i=0; i <= m+1; i++) {
printf("\n");
for (j=0; j <= n+1; j++) {
printf("%3.5lf ", t[i][j]);
}
}
printf("\n");
tstop = omp_get_wtime ();
printf("time taken is %4.3lf\n", (tstop-tstart));
printf("\n");
}
除了以下代碼外,我看不到明顯的問題:
// Fragment quoted from the program above: finds the largest change between
// tnew and t and copies tnew into t. i, j, diff, priv_difmax, difmax, m, n,
// t and tnew are declared by the enclosing program.
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
// Per-thread running maximum.
priv_difmax = 0.0;
#pragma omp for schedule(static)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > priv_difmax) {
priv_difmax = diff;
}
// Copy the new temperature over the old one.
t[i][j] = tnew[i][j];
}
// The critical section below is placed before the brace that closes the
// i-loop, so it runs once per row handled by the thread.
#pragma omp critical
if (priv_difmax > difmax){
difmax = priv_difmax;
}
}
}
歸約(reduction)部分,也就是把 priv_difmax 合併到 difmax 的那一步,應該移到循環之外,這樣每個線程只需進入一次 critical 區段,而不是在外層循環的每次迭代中都進入一次。
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    /* Largest change seen so far by this thread on the rows it owns. */
    priv_difmax = 0.0;

    /* nowait: a thread that finishes its rows may go straight on to the
     * critical section instead of waiting for the rest of the team. */
    #pragma omp for schedule(static) nowait
    for (i = 1; i <= m; i++) {
        for (j = 1; j <= n; j++) {
            diff = fabs(tnew[i][j] - t[i][j]);
            priv_difmax = (diff > priv_difmax) ? diff : priv_difmax;

            /* Copy the new temperature over the old one. */
            t[i][j] = tnew[i][j];
        }
    }

    /* The loop is finished for this thread; fold its maximum into the
     * shared one, one thread at a time. */
    #pragma omp critical
    {
        if (difmax < priv_difmax) {
            difmax = priv_difmax;
        }
    }
}   /* implicit barrier at the end of the parallel region */
現在,並行化具有開銷成本,並且僅對於較大的m和n值,可以預期加速。 您正在考慮的問題可能太小。 減少開銷的方法是合並兩個parallel
結構,這樣就不必產生兩次線程池。 甚至更好的是,將while循環放入parallel
構造中,這樣我們只需要在每次迭代時同步現有線程,而不必創建和銷毀它們:
difmax = 1000000.0;
/*
 * The whole iteration runs inside a single parallel region, so the thread
 * team is created once and only synchronised on each sweep.
 * i, j, diff, priv_difmax, difmax, iter, tol, m, n, t and tnew are declared
 * by the enclosing program.
 */
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
while (difmax > tol) {
    /* Every thread must have evaluated the while-condition before difmax is
     * reset. Without this barrier a late thread could read the freshly
     * reset 0.0, leave the loop on its own and leave the rest of the team
     * stuck at the next barrier. */
    #pragma omp barrier

    /* One thread resets difmax and increments iter. */
    #pragma omp single nowait
    {
        iter++;
        difmax = 0.0;
    }

    /* Loop to update tnew, distributed among the threads of the existing
     * team. This must be a worksharing "omp for": a "parallel for" here
     * would open a nested parallel region in which every outer thread
     * recomputes the whole grid by itself. */
    #pragma omp for schedule(static)
    for (i = 1; i <= m; i++) {
        for (j = 1; j <= n; j++) {
            tnew[i][j] = (t[i-1][j] + t[i+1][j] + t[i][j-1] + t[i][j+1]) / 4.0;
        }
    }
    /* Implicit barrier here: tnew is complete and the reset of difmax has
     * finished before any thread goes on. */

    /* Each thread resets its private maximum. */
    priv_difmax = 0.0;

    /* Loop to compute the maximum change, distributed among threads. */
    #pragma omp for schedule(static) nowait
    for (i = 1; i <= m; i++) {
        for (j = 1; j <= n; j++) {
            diff = fabs(tnew[i][j] - t[i][j]);
            if (diff > priv_difmax) {
                priv_difmax = diff;
            }
            /* Copy the new temperature over the old one. */
            t[i][j] = tnew[i][j];
        }
    }

    /* Each thread now updates difmax if needed, one at a time. */
    #pragma omp critical
    {
        if (priv_difmax > difmax) {
            difmax = priv_difmax;
        }
    }

    /* Make sure difmax holds the team-wide maximum and t is fully updated
     * before any thread tests the while-condition for the next sweep. */
    #pragma omp barrier
}
比較代碼以串行和並行方式運行的最好方法是在有和沒有OpenMP支持的情況下進行編譯(例如,使用gcc,使用和不使用-fopenmp編譯器和鏈接器標志進行編譯)。 這將有助於指出問題出在實際上是並行化還是原始串行代碼與“並行就緒”版本之間的其他修改。
這樣做的目的是弄清楚:從原始的串行代碼,到(不啟用 OpenMP 編譯的)並行代碼,再到(啟用 OpenMP 編譯的)並行代碼,時間究竟浪費在哪一步。
需要加入一些預處理指令,因為在沒有 OpenMP 支持的情況下,編譯器無法識別 omp_get_thread_num() 這類函數。omp_get_wtime() 也不應使用;由於所有計時都是在並行區域之外進行的,因此無需使用該特定函數,改為調用 time() 即可得到準確的結果(這需要 #include <time.h>)。
// Lets the same source build and run whether or not it is compiled with
// OpenMP support. With OpenMP the real header is used; without it the
// query routines below are replaced by serial stand-ins.
#ifdef _OPENMP
#include <omp.h>
#else
# ifndef _ESCAPE_OMPENMP
// A serial build is a team of exactly one thread whose id is 0.
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
// The maximum team size of a serial build is 1, not 0; code that sizes
// per-thread storage or divides by this value relies on it being >= 1.
#define omp_get_max_threads() 1
#define _ESCAPE_OMPENMP
#endif
#endif
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.