[英]Parallel programming in C++ with OpenMP
這是我第一次嘗試並行化我的代碼。 這次嘗試似乎導致了競爭條件(race condition),代碼產生了無意義的值。 是否可以用簡單的方式並行化我的代碼? 我知道代碼塊很長,對此感到抱歉。 我的所有變量都在您在此處看到的代碼之前聲明。 真的很感謝您的幫助!
// Request a team of two OpenMP threads for subsequent parallel regions.
int numthreads=2;
omp_set_num_threads(numthreads);
// NOTE(review): this work-shares the time-step loop itself across threads. `iter` and the
// other working variables assigned below are not declared inside the loop, so OpenMP treats
// them as shared by default and every thread writes them concurrently -- presumably the
// source of the reported race; confirm, and consider parallelizing the (i, j) sweeps instead.
#pragma omp parallel for
for (int t = 1; t <= tmax; ++t) {
// Each time step makes five passes (iter = 0..4); the switch below picks k_0 for the pass.
iter=0;
while(iter<5){
// Select the temperature offset k_0 used by this pass.
// iter == 0 has no case, so k_0 keeps whatever value it already holds on that pass.
switch (iter) {
    case 1:
        // Mirror the interior of k_1 into its ghost rows/columns before scaling.
        for (int j = 0; j < Nx + 1; ++j) {
            k_1(Nx + 1, j) = k_1(Nx - 1, j);
            k_1(0, j) = k_1(2, j);
            k_1(j, Ny + 1) = k_1(j, Ny - 1);
            // Fixed: this ghost column used to be filled from C_new(j, 2), which mixed the
            // concentration field into the slope; it now mirrors k_1 like the other three edges.
            k_1(j, 0) = k_1(j, 2);
        }
        k_0 = a2 * dt * k_1;
        break;
    case 2:
        // Same mirroring for k_2.
        for (int j = 0; j < Nx + 1; ++j) {
            k_2(Nx + 1, j) = k_2(Nx - 1, j);
            k_2(0, j) = k_2(2, j);
            k_2(j, Ny + 1) = k_2(j, Ny - 1);
            // Fixed: same copy-paste slip as in case 1 (was C_new(j, 2)).
            k_2(j, 0) = k_2(j, 2);
        }
        k_0 = a3 * dt * k_2;
        break;
    case 3:
        // Same mirroring for k_3.
        for (int j = 0; j < Nx + 1; ++j) {
            k_3(Nx + 1, j) = k_3(Nx - 1, j);
            k_3(0, j) = k_3(2, j);
            k_3(j, Ny + 1) = k_3(j, Ny - 1);
            k_3(j, 0) = k_3(j, 2);
        }
        k_0 = a4 * dt * k_3;
        break;
    case 4:
        // Final pass runs with a zero offset.
        k_0.fill(0);
        break;
}
// Sweep over the interior cells (i = 1..Nx, j = 1..Ny). For each cell this writes
// psi_new(i, j), dpsi_dt(i, j) and C_new(i, j) from the *_old fields and the current k_0.
// NOTE(review): every scalar assigned in this body (Psi_c*, DERX, DERY, JR, ..., rand, dpsi)
// is declared outside the loop, so the statement order below matters and the body is not
// safe to run concurrently as written.
for (int i = 1; i <= Nx; ++i) {
for (int j = 1; j <= Ny; ++j) {
// Computing ghost nodes values around (i,j)
//Order parameter
// Each Psi_c* value is the average of psi_old over the four cells that share one corner of cell (i, j).
Psi_cipjp = (psi_old(i + 1, j + 1) + psi_old(i, j + 1) + psi_old(i, j) + psi_old(i + 1, j)) / 4;
Psi_cipjm = (psi_old(i + 1, j) + psi_old(i, j) + psi_old(i, j - 1) + psi_old(i + 1, j - 1)) / 4;
Psi_cimjp = (psi_old(i, j + 1) + psi_old(i - 1, j + 1) + psi_old(i - 1, j) + psi_old(i, j)) / 4;
Psi_cimjm = (psi_old(i, j) + psi_old(i - 1, j) + psi_old(i - 1, j - 1) + psi_old(i, j - 1)) / 4;
// UPDATING THE ORDER PARAMETER PSI !//
// Calculating right edge flux JR
DERX = (psi_old(i + 1, j) - psi_old(i, j)) / dx;
DERY = (Psi_cipjp - Psi_cipjm) / dx;
// Setting anisotropy parameters
// NOTE(review): Atheta and Aptheta are read right after each aniso() call but never assigned
// in this block -- presumably aniso() sets them as a side effect; confirm.
aniso(DERX, DERY, a_s, a_12, eps, epsilon);
JR = Atheta * (Atheta * DERX + Aptheta * DERY);
// Calculating left edge flux JL
DERX = (psi_old(i, j) - psi_old(i - 1, j)) / dx;
DERY = (Psi_cimjp - Psi_cimjm) / dx;
// Setting anisotropy parameters
aniso(DERX, DERY, a_s, a_12, eps, epsilon);
JL = Atheta * (Atheta * DERX + Aptheta * DERY);
// Calculating top edge flux JT
DERY = (psi_old(i, j + 1) - psi_old(i, j)) / dx;
DERX = (Psi_cipjp - Psi_cimjp) / dx;
// Setting anisotropy parameters
aniso(DERX, DERY, a_s, a_12, eps, epsilon);
JT = Atheta * (Atheta * DERY - Aptheta * DERX);
// Calculating bottom edge flux JB
DERY = (psi_old(i, j) - psi_old(i, j - 1)) / dx;
DERX = (Psi_cipjm - Psi_cimjm) / dx;
// Setting anisotropy parameters
aniso(DERX, DERY, a_s, a_12, eps, epsilon);
JB = Atheta * (Atheta * DERY - Aptheta * DERX);
// Update psi
// M, HA, HB and H blend the A and B values linearly in C_old(i, j); the temperature enters as T_old + k_0.
M = (1 - C_old(i, j)) * Ma + C_old(i, j) * Mb;
g = pow(psi_old(i, j), 2) * pow((1 - psi_old(i, j)), 2);
gprime = 2 * psi_old(i, j) * (1 - psi_old(i, j)) * (1 - 2 * psi_old(i, j));
HA = Wa * gprime + 30 * g * H_A * (1 /( T_old(i, j)+k_0(i,j)) - 1 / Tm_A);
HB = Wb * gprime + 30 * g * H_B * (1 / (T_old(i, j)+k_0(i,j)) - 1 / Tm_B);
H = (1 - C_old(i, j)) * HA + C_old(i, j) * HB;
// One draw per cell from the generator/distribution objects declared outside this block.
// NOTE(review): gen is not declared per thread here; concurrent draws would race -- confirm.
rand=distr(gen);
Noise=M*A_noise*rand*16*g*((1-C_old(i,j))*HA+C_old(i,j)*HB);
dpsi=(dt / dx) * ((JR - JL + JT - JB) * M * Epsilon2 - dx * M * H-dx*Noise);
psi_new(i, j) = psi_old(i, j) + dpsi;
dpsi_dt(i, j) = dpsi/ dt;
//std::cout<<"dpsi_dt="<<dpsi_dt(i,j)<<std::endl;
// UPDATING THE CONCENTRATION FIELD ! //
//Evaluating field values on finite volume boundary
// NOTE(review): dpsi_dt(i - 1, j) and dpsi_dt(i, j - 1) were already overwritten earlier in this
// sweep, whereas dpsi_dt(i + 1, j) and dpsi_dt(i, j + 1) have not been written yet in this sweep,
// so these face averages mix values from two different sweeps -- confirm this is intended.
dpsi_dt_R = (dpsi_dt(i + 1, j) + dpsi_dt(i, j)) / 2;
dpsi_dt_L = (dpsi_dt(i, j) + dpsi_dt(i - 1, j)) / 2;
dpsi_dt_T = (dpsi_dt(i, j + 1) + dpsi_dt(i, j)) / 2;
dpsi_dt_B = (dpsi_dt(i, j) + dpsi_dt(i, j - 1)) / 2;
// Face values: arithmetic mean of the two cells on either side of each face.
psi_R = (psi_old(i + 1, j) + psi_old(i, j)) / 2;
psi_L = (psi_old(i, j) + psi_old(i - 1, j)) / 2;
psi_T = (psi_old(i, j + 1) + psi_old(i, j)) / 2;
psi_B = (psi_old(i, j) + psi_old(i, j - 1)) / 2;
C_R = (C_old(i + 1, j) + C_old(i, j)) / 2;
C_L = (C_old(i, j) + C_old(i - 1, j)) / 2;
C_T = (C_old(i, j + 1) + C_old(i, j)) / 2;
C_B = (C_old(i, j) + C_old(i, j - 1)) / 2;
T_R = (T_old(i + 1, j)+k_0(i+1,j) + T_old(i, j)+k_0(i,j)) / 2;
T_L = (T_old(i, j)+k_0(i,j) + T_old(i - 1, j)+k_0(i-1,j)) / 2;
T_T = (T_old(i, j + 1)+k_0(i,j+1) + T_old(i, j)+k_0(i,j)) / 2;
T_B = (T_old(i, j)+k_0(i,j) + T_old(i, j - 1)+k_0(i,j-1)) / 2;
// Recomputed with the same four expressions used at the top of the loop body.
Psi_cipjp = (psi_old(i + 1, j + 1) + psi_old(i, j + 1) + psi_old(i, j) + psi_old(i + 1, j)) / 4;
Psi_cipjm = (psi_old(i + 1, j) + psi_old(i, j) + psi_old(i, j - 1) + psi_old(i + 1, j - 1)) / 4;
Psi_cimjp = (psi_old(i, j + 1) + psi_old(i - 1, j + 1) + psi_old(i - 1, j) + psi_old(i, j)) / 4;
Psi_cimjm = (psi_old(i, j) + psi_old(i - 1, j) + psi_old(i - 1, j - 1) + psi_old(i, j - 1)) / 4;
// Calculating right edge flux for anti-trapping term
g = pow(psi_R, 2) * pow((1 - psi_R), 2);
gprime = 2 * psi_R * (1 - psi_R) * (1 - 2 * psi_R);
HA = Wa * gprime + 30 * g * H_A * (1 / T_R - 1 / Tm_A);
HB = Wb * gprime + 30 * g * H_B * (1 / T_R - 1 / Tm_B);
DERX = (psi_old(i + 1, j) - psi_old(i, j)) / dx;
DERY = (Psi_cipjp - Psi_cipjm) / dx;
DERX_C = (C_old(i + 1, j) - C_old(i, j)) / dx;
Mag2 = pow(DERX, 2) + pow(DERY, 2);
JR = DERX_C + Vm / R * C_R * (1 - C_R) * (HB - HA) * DERX;
// The anti-trapping contribution stays zero unless the squared gradient magnitude exceeds eps,
// which also keeps the division by sqrt(Mag2) away from zero.
JR_a = 0;
if (Mag2 > eps) {
JR_a = a * lambda * (1 - partition) * 2 * C_R / (1 + partition - (1 - partition) * psi_R) * dpsi_dt_R * DERX / sqrt(Mag2);
}
// Calculating left edge flux for anti-trapping term
g = pow(psi_L, 2) * pow((1 - psi_L), 2);
gprime = 2 * psi_L * (1 - psi_L) * (1 - 2 * psi_L);
HA = Wa * gprime + 30 * g * H_A * (1 / T_L - 1 / Tm_A);
HB = Wb * gprime + 30 * g * H_B * (1 / T_L - 1 / Tm_B);
DERX = (psi_old(i, j) - psi_old(i - 1, j)) / dx;
DERY = (Psi_cimjp - Psi_cimjm) / dx;
DERX_C = (C_old(i, j) - C_old(i - 1, j)) / dx;
Mag2 = pow(DERX, 2) + pow(DERY, 2);
JL = DERX_C + Vm / R * C_L * (1 - C_L) * (HB - HA) * DERX;
JL_a = 0;
if (Mag2 > eps) {
JL_a = a * lambda * (1 - partition) * 2 * C_L / (1 + partition - (1 - partition) * psi_L) * dpsi_dt_L * DERX / sqrt(Mag2);
}
// Calculating top edge flux for anti-trapping term
g = pow(psi_T, 2) * pow((1 - psi_T), 2);
gprime = 2 * psi_T * (1 - psi_T) * (1 - 2 * psi_T);
HA = Wa * gprime + 30 * g * H_A * (1 / T_T - 1 / Tm_A);
HB = Wb * gprime + 30 * g * H_B * (1 / T_T - 1 / Tm_B);
DERY = (psi_old(i, j + 1) - psi_old(i, j)) / dx;
DERX = (Psi_cipjp - Psi_cimjp) / dx;
DERY_C = (C_old(i, j + 1) - C_old(i, j)) / dx;
Mag2 = pow(DERX, 2) + pow(DERY, 2);
JT = DERY_C + Vm / R * C_T * (1 - C_T) * (HB - HA) * DERY;
JT_a = 0;
if (Mag2 > eps) {
JT_a = a * lambda * (1 - partition) * 2 * C_T / (1 + partition - (1 - partition) * psi_T) * dpsi_dt_T * DERY / sqrt(Mag2);
}
// Calculating bottom edge flux for anti-trapping term
g = pow(psi_B, 2) * pow((1 - psi_B), 2);
gprime = 2 * psi_B * (1 - psi_B) * (1 - 2 * psi_B);
HA = Wa * gprime + 30 * g * H_A * (1 / T_B - 1 / Tm_A);
HB = Wb * gprime + 30 * g * H_B * (1 / T_B - 1 / Tm_B);
DERY = (psi_old(i, j) - psi_old(i, j - 1)) / dx;
DERX = (Psi_cipjm - Psi_cimjm) / dx;
DERY_C = (C_old(i, j) - C_old(i, j - 1)) / dx;
Mag2 = pow(DERX, 2) + pow(DERY, 2);
JB = DERY_C + Vm / R * C_B * (1 - C_B) * (HB - HA) * DERY;
JB_a = 0;
if (Mag2 > eps) {
JB_a = a * lambda * (1 - partition) * 2 * C_B / (1 + partition - (1 - partition) * psi_B) * dpsi_dt_B * DERY / sqrt(Mag2);
}
// Update the concentration C
// Face diffusivities: D_s plus a polynomial in the face psi value times (D_l - D_s).
DR = D_s + pow(psi_R, 3) * (10 - 15 * psi_R + 6 * pow(psi_R, 2)) * (D_l - D_s);
DL = D_s + pow(psi_L, 3) * (10 - 15 * psi_L + 6 * pow(psi_L, 2)) * (D_l - D_s);
DT = D_s + pow(psi_T, 3) * (10 - 15 * psi_T + 6 * pow(psi_T, 2)) * (D_l - D_s);
DB = D_s + pow(psi_B, 3) * (10 - 15 * psi_B + 6 * pow(psi_B, 2)) * (D_l - D_s);
// Explicit update: old value plus dt/dx times the net of the four face fluxes.
C_new(i, j) = C_old(i, j) + dt / dx * (DR * (JR + JR_a) - DL * (JL + JL_a) + DT * (JT + JT_a) - DB * (JB + JB_a));
}
}
for(int j=0;j<Nx+1;++j){
C_new(Nx + 1, j) = C_new(Nx - 1, j);
C_new(0, j) = C_new(2, j);
C_new(j, Ny + 1) = C_new(j, Ny - 1);
C_new(j, 0) = C_new(j, 2);
psi_new(Nx + 1, j) = psi_new(Nx - 1, j);
psi_new(0, j) = psi_new(2, j);
psi_new(j, Ny + 1) = psi_new(j, Ny - 1);
psi_new(j, 0) = psi_new(j, 2);
}
//UPDATING THE TEMPERATURE EQUATION!//
//Finte volume with explicit Euler
// KR = (1 - C_R) * K_A + C_R * K_B;
// KL = (1 - C_L) * K_A + C_L * K_B;
// KT = (1 - C_T) * K_A + C_T * K_B;
// KB = (1 - C_B) * K_A + C_B * K_B;
//
// //calculating right edge flux for the temperature field
//
// DERX_T = (T_old(i + 1, j) - T_old(i, j)) / dx;
// JR = KR * DERX_T;
//
// //calculating left edge flux for the temperature field
//
// DERX_T = (T_old(i, j) - T_old(i - 1, j)) / dx;
// JL = KL * DERX_T;
//
// //calculating top edge flux for the temperature field
//
// DERY_T = (T_old(i, j + 1) - T_old(i, j)) / dx;
// JT = KT * DERY_T;
//
// //calculating bottom edge flux for the temperature field
//
// DERY_T = (T_old(i, j) - T_old(i, j - 1)) / dx;
// JB = KB * DERY_T;
//
// cp = (1 - C_old(i, j)) * cp_A + C_old(i, j) * cp_B;
// Htilde = (1 - C_old(i, j)) * H_A + C_old(i, j) * H_B;
// g = pow(psi_old(i, j), 2) * pow((1 - psi_old(i, j)), 2);
//
//
// T_new(i,j) = dt / (cp * dx * dx) * (dx * (JR - JL + JT - JB) - dx * dx * 30 * g * Htilde * dpsi_dt(i, j)) + T_old(i, j);
//Finite difference
// Finite-difference temperature update, executed on passes 0..3 only. Each pass evaluates the
// right-hand side at the stage temperature T_old + k_0 and stores it in k_1..k_4 (pass 0 -> k_1,
// ..., pass 3 -> k_4); pass 3 additionally combines the four stored slopes into T_new.
if (iter < 4) {
    // Stage temperature at one cell. Going through this helper keeps T_old and k_0 on the same
    // index and keeps the pair grouped under any sign or factor applied to it.
    const auto stage_T = [&](int r, int c) { return T_old(r, c) + k_0(r, c); };

    for (int i = 1; i <= Nx; ++i) {
        for (int j = 1; j <= Ny; ++j) {
            K = (1 - C_new(i, j)) * K_A + C_new(i, j) * K_B;
            DERX_C = (C_new(i + 1, j) - C_new(i, j)) / dx;
            DERY_C = (C_new(i, j + 1) - C_new(i, j)) / dx;
            // Fixed: the centre term was written "- T_old(i,j) + k_0(i,j)", which added k_0
            // instead of subtracting it; the difference is now taken between stage temperatures.
            DERX_T = (stage_T(i + 1, j) - stage_T(i, j)) / dx;
            DERY_T = (stage_T(i, j + 1) - stage_T(i, j)) / dx;
            cp = (1 - C_new(i, j)) * cp_A + C_new(i, j) * cp_B;
            Htilde = (1 - C_new(i, j)) * H_A + C_new(i, j) * H_B;
            g = pow(psi_new(i, j), 2) * pow((1 - psi_new(i, j)), 2);
            // Nine-cell stencil: 0.5 x edge neighbours + 0.25 x corner neighbours - 3 x centre.
            // Fixed: the (i-1, j-1) corner used k_0(i-1, j+1), and the factor -3 was applied
            // to T_old(i, j) only while k_0(i, j) was added with weight +1.
            Nabla = 1 / pow(dx, 2) * (0.5 * (stage_T(i + 1, j) + stage_T(i - 1, j) + stage_T(i, j + 1) + stage_T(i, j - 1))
                                      + 0.25 * (stage_T(i + 1, j + 1) + stage_T(i + 1, j - 1) + stage_T(i - 1, j + 1) + stage_T(i - 1, j - 1))
                                      - 3 * stage_T(i, j));
            // The right-hand side is the same expression on every pass; only its destination differs.
            const auto rhs = 1 / cp * ((K_B - K_A) * (DERX_C * DERX_T + DERY_C * DERY_T) + K * Nabla - 30 * g * Htilde * dpsi_dt(i, j));
            switch (iter) {
                case 0:
                    k1 = rhs;
                    k_1(i, j) = k1;
                    break;
                case 1:
                    k2 = rhs;
                    k_2(i, j) = k2;
                    break;
                case 2:
                    k3 = rhs;
                    k_3(i, j) = k3;
                    break;
                case 3:
                    k4 = rhs;
                    k_4(i, j) = k4;
                    //std::cout<<"k_1="<<k_1<<"\n"<<"k_2="<<k_2<<"\n"<<"k_3="<<k_3<<"\n"<<"k_4="<<k_4<<std::endl;
                    // Weighted combination of the four stored slopes gives the new temperature.
                    T_new(i, j) = T_old(i, j) + dt * (b1 * k_1(i, j) + b2 * k_2(i, j) + b3 * k_3(i, j) + b4 * k_4(i, j));
                    break;
            }
        }
    }
}
// Advance to the next pass of the current time step; the while loop ends once iter reaches 5.
iter++;
}
“有可能以一種簡單的方式並行化我的代碼嗎?”
您的代碼並不簡單,因此如果不花費太多時間,任何人都無法回答。
“我的所有變量都在您在此處看到的代碼之前聲明。”
我發現使用OpenMP時,反其道而行最容易。 只要OpenMP中有並行代碼段,您就必須考慮所有線程同時執行時會發生的情況。 因此,如果在並行區域內聲明一個變量,該變量就會有n個副本(每個線程一個)。 如果您嘗試寫入在並行區域之外聲明的變量,就會有n個線程同時嘗試寫入同一個資源。
如果您想讓OpenMP用起來簡單,請盡可能讓變量成為線程局部變量(也就是把循環中用到的所有變量都聲明在該循環內部)。 如果這不符合您的需要,請研究如何使用OpenMP的reduction子句:它會為變量創建各線程的本地副本,並在並行區域結束時(通過加法、乘法等)把這些副本合併成代表所有線程結果的最終值。 作為最後的選擇,可以在並行區域中引用外部資源,但您可能需要在代碼中加入critical或atomic指示詞(directive),以確保一次只有一個線程執行該部分代碼。
要處理大型數組,較容易的做法是把中間結果存儲在並行區域內為每個線程分配的子數組中,然後在最後用一段單線程代碼處理各個子數組,並把結果適當地存回大型數組。
與其他所有優化一樣,請務必使用某種計時機制,確認您的更改確實加快了速度! 只要要計時的代碼區域需要運行數毫秒以上,我建議使用std::chrono::steady_clock。
由於您遺漏了關鍵信息(例如算法的功能和所有變量的含義),因此我無法真正為您提供幫助,但是我可以為您提供一般的並行化建議。
首先,您需要使用性能分析器,找出代碼中最耗時的部分。 我打賭是 for (int i = 1; i <= Nx; ++i) for (int j = 1; j <= Ny; ++j) 這一部分,但我們需要確認。 假設您做了分析,而我猜對了。 下一步是什麼?
目前,您把所有變量都聲明在外部作用域中。 這樣是行不通的。 您修改的每個變量/指針都需要在並行函數/循環的作用域內聲明。 假設我說得沒錯、for循環確實是關鍵部分,那麼您的代碼應該更像這樣:
for (int i = 1; i <= Nx; ++i) {
    for (int j = 1; j <= Ny; ++j) {
        // Move all the declarations to local scope -- if this loop runs in parallel, each
        // iteration then has its own variables to work with.
        // Declared as double, not int: these are averages and quotients, and an int would
        // silently truncate them.
        double Psi_cipjp = (psi_old(i + 1, j + 1) + psi_old(i, j + 1) + psi_old(i, j) + psi_old(i + 1, j)) / 4;
        double Psi_cipjm = (psi_old(i + 1, j) + psi_old(i, j) + psi_old(i, j - 1) + psi_old(i + 1, j - 1)) / 4;
        double Psi_cimjp = (psi_old(i, j + 1) + psi_old(i - 1, j + 1) + psi_old(i - 1, j) + psi_old(i, j)) / 4;
        double Psi_cimjm = (psi_old(i, j) + psi_old(i - 1, j) + psi_old(i - 1, j - 1) + psi_old(i, j - 1)) / 4;
        double DERX = (psi_old(i + 1, j) - psi_old(i, j)) / dx;
        double DERY = (Psi_cipjp - Psi_cipjm) / dx;
        /* and so on... */
    }
}
但是您的算法似乎是一個復雜的算法,因此如果您發現必須認真思考一下實現以使其並行化,我不會感到驚訝。 我建議您在解決此問題之前嘗試一些更簡單的並行化問題。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.