[英]Parallel programming in c++ with openmp
这是我第一次尝试并行化我的代码。 这种尝试似乎导致了“竞赛”,并且代码产生了无意义的值。 是否可以以简单的方式并行化我的代码? 我知道代码块很长,对此感到抱歉。 我的所有变量都在您在此处看到的代码之前声明。 真的很感谢您的帮助!
int numthreads=2;
omp_set_num_threads(numthreads);
#pragma omp parallel for
for (int t = 1; t <= tmax; ++t) {
iter=0;
while(iter<5){
switch(iter){
case 1:
for(int j=0;j<Nx+1;++j){
k_1(Nx + 1, j) = k_1(Nx - 1, j);
k_1(0, j) = k_1(2, j);
k_1(j, Ny + 1) = k_1(j, Ny - 1);
k_1(j, 0) = C_new(j, 2);
}
k_0=a2*dt*k_1;
break;
case 2:
for(int j=0;j<Nx+1;++j){
k_2(Nx + 1, j) = k_2(Nx - 1, j);
k_2(0, j) = k_2(2, j);
k_2(j, Ny + 1) = k_2(j, Ny - 1);
k_2(j, 0) = C_new(j, 2);
}
k_0=a3*dt*k_2;
break;
case 3:
for(int j=0;j<Nx+1;++j){
k_3(Nx + 1, j) = k_3(Nx - 1, j);
k_3(0, j) = k_3(2, j);
k_3(j, Ny + 1) = k_3(j, Ny - 1);
k_3(j, 0) = k_3(j, 2);
}
k_0=a4*dt*k_3;
break;
case 4:
k_0.fill(0);
break;
}
for (int i = 1; i <= Nx; ++i) {
for (int j = 1; j <= Ny; ++j) {
// Computing ghost nodes values around (i,j)
//Order parameter
Psi_cipjp = (psi_old(i + 1, j + 1) + psi_old(i, j + 1) + psi_old(i, j) + psi_old(i + 1, j)) / 4;
Psi_cipjm = (psi_old(i + 1, j) + psi_old(i, j) + psi_old(i, j - 1) + psi_old(i + 1, j - 1)) / 4;
Psi_cimjp = (psi_old(i, j + 1) + psi_old(i - 1, j + 1) + psi_old(i - 1, j) + psi_old(i, j)) / 4;
Psi_cimjm = (psi_old(i, j) + psi_old(i - 1, j) + psi_old(i - 1, j - 1) + psi_old(i, j - 1)) / 4;
// UPDATING THE ORDER PARAMETER PSI !//
// Calculating right edge flux JR
DERX = (psi_old(i + 1, j) - psi_old(i, j)) / dx;
DERY = (Psi_cipjp - Psi_cipjm) / dx;
// Setting anisotropy parameters
aniso(DERX, DERY, a_s, a_12, eps, epsilon);
JR = Atheta * (Atheta * DERX + Aptheta * DERY);
// Calculating left edge flux JL
DERX = (psi_old(i, j) - psi_old(i - 1, j)) / dx;
DERY = (Psi_cimjp - Psi_cimjm) / dx;
// Setting anisotropy parameters
aniso(DERX, DERY, a_s, a_12, eps, epsilon);
JL = Atheta * (Atheta * DERX + Aptheta * DERY);
// Calculating top edge flux JT
DERY = (psi_old(i, j + 1) - psi_old(i, j)) / dx;
DERX = (Psi_cipjp - Psi_cimjp) / dx;
// Setting anisotropy parameters
aniso(DERX, DERY, a_s, a_12, eps, epsilon);
JT = Atheta * (Atheta * DERY - Aptheta * DERX);
// Calculating bottom edge flux JB
DERY = (psi_old(i, j) - psi_old(i, j - 1)) / dx;
DERX = (Psi_cipjm - Psi_cimjm) / dx;
// Setting anisotropy parameters
aniso(DERX, DERY, a_s, a_12, eps, epsilon);
JB = Atheta * (Atheta * DERY - Aptheta * DERX);
// Update psi
M = (1 - C_old(i, j)) * Ma + C_old(i, j) * Mb;
g = pow(psi_old(i, j), 2) * pow((1 - psi_old(i, j)), 2);
gprime = 2 * psi_old(i, j) * (1 - psi_old(i, j)) * (1 - 2 * psi_old(i, j));
HA = Wa * gprime + 30 * g * H_A * (1 /( T_old(i, j)+k_0(i,j)) - 1 / Tm_A);
HB = Wb * gprime + 30 * g * H_B * (1 / (T_old(i, j)+k_0(i,j)) - 1 / Tm_B);
H = (1 - C_old(i, j)) * HA + C_old(i, j) * HB;
rand=distr(gen);
Noise=M*A_noise*rand*16*g*((1-C_old(i,j))*HA+C_old(i,j)*HB);
dpsi=(dt / dx) * ((JR - JL + JT - JB) * M * Epsilon2 - dx * M * H-dx*Noise);
psi_new(i, j) = psi_old(i, j) + dpsi;
dpsi_dt(i, j) = dpsi/ dt;
//std::cout<<"dpsi_dt="<<dpsi_dt(i,j)<<std::endl;
// UPDATING THE CONCENTRATION FIELD ! //
//Evaluating field values on finite volume boundary
dpsi_dt_R = (dpsi_dt(i + 1, j) + dpsi_dt(i, j)) / 2;
dpsi_dt_L = (dpsi_dt(i, j) + dpsi_dt(i - 1, j)) / 2;
dpsi_dt_T = (dpsi_dt(i, j + 1) + dpsi_dt(i, j)) / 2;
dpsi_dt_B = (dpsi_dt(i, j) + dpsi_dt(i, j - 1)) / 2;
psi_R = (psi_old(i + 1, j) + psi_old(i, j)) / 2;
psi_L = (psi_old(i, j) + psi_old(i - 1, j)) / 2;
psi_T = (psi_old(i, j + 1) + psi_old(i, j)) / 2;
psi_B = (psi_old(i, j) + psi_old(i, j - 1)) / 2;
C_R = (C_old(i + 1, j) + C_old(i, j)) / 2;
C_L = (C_old(i, j) + C_old(i - 1, j)) / 2;
C_T = (C_old(i, j + 1) + C_old(i, j)) / 2;
C_B = (C_old(i, j) + C_old(i, j - 1)) / 2;
T_R = (T_old(i + 1, j)+k_0(i+1,j) + T_old(i, j)+k_0(i,j)) / 2;
T_L = (T_old(i, j)+k_0(i,j) + T_old(i - 1, j)+k_0(i-1,j)) / 2;
T_T = (T_old(i, j + 1)+k_0(i,j+1) + T_old(i, j)+k_0(i,j)) / 2;
T_B = (T_old(i, j)+k_0(i,j) + T_old(i, j - 1)+k_0(i,j-1)) / 2;
Psi_cipjp = (psi_old(i + 1, j + 1) + psi_old(i, j + 1) + psi_old(i, j) + psi_old(i + 1, j)) / 4;
Psi_cipjm = (psi_old(i + 1, j) + psi_old(i, j) + psi_old(i, j - 1) + psi_old(i + 1, j - 1)) / 4;
Psi_cimjp = (psi_old(i, j + 1) + psi_old(i - 1, j + 1) + psi_old(i - 1, j) + psi_old(i, j)) / 4;
Psi_cimjm = (psi_old(i, j) + psi_old(i - 1, j) + psi_old(i - 1, j - 1) + psi_old(i, j - 1)) / 4;
// Calculating right edge flux for anti-trapping term
g = pow(psi_R, 2) * pow((1 - psi_R), 2);
gprime = 2 * psi_R * (1 - psi_R) * (1 - 2 * psi_R);
HA = Wa * gprime + 30 * g * H_A * (1 / T_R - 1 / Tm_A);
HB = Wb * gprime + 30 * g * H_B * (1 / T_R - 1 / Tm_B);
DERX = (psi_old(i + 1, j) - psi_old(i, j)) / dx;
DERY = (Psi_cipjp - Psi_cipjm) / dx;
DERX_C = (C_old(i + 1, j) - C_old(i, j)) / dx;
Mag2 = pow(DERX, 2) + pow(DERY, 2);
JR = DERX_C + Vm / R * C_R * (1 - C_R) * (HB - HA) * DERX;
JR_a = 0;
if (Mag2 > eps) {
JR_a = a * lambda * (1 - partition) * 2 * C_R / (1 + partition - (1 - partition) * psi_R) * dpsi_dt_R * DERX / sqrt(Mag2);
}
// Calculating left edge flux for anti-trapping term
g = pow(psi_L, 2) * pow((1 - psi_L), 2);
gprime = 2 * psi_L * (1 - psi_L) * (1 - 2 * psi_L);
HA = Wa * gprime + 30 * g * H_A * (1 / T_L - 1 / Tm_A);
HB = Wb * gprime + 30 * g * H_B * (1 / T_L - 1 / Tm_B);
DERX = (psi_old(i, j) - psi_old(i - 1, j)) / dx;
DERY = (Psi_cimjp - Psi_cimjm) / dx;
DERX_C = (C_old(i, j) - C_old(i - 1, j)) / dx;
Mag2 = pow(DERX, 2) + pow(DERY, 2);
JL = DERX_C + Vm / R * C_L * (1 - C_L) * (HB - HA) * DERX;
JL_a = 0;
if (Mag2 > eps) {
JL_a = a * lambda * (1 - partition) * 2 * C_L / (1 + partition - (1 - partition) * psi_L) * dpsi_dt_L * DERX / sqrt(Mag2);
}
// Calculating top edge flux for anti-trapping term
g = pow(psi_T, 2) * pow((1 - psi_T), 2);
gprime = 2 * psi_T * (1 - psi_T) * (1 - 2 * psi_T);
HA = Wa * gprime + 30 * g * H_A * (1 / T_T - 1 / Tm_A);
HB = Wb * gprime + 30 * g * H_B * (1 / T_T - 1 / Tm_B);
DERY = (psi_old(i, j + 1) - psi_old(i, j)) / dx;
DERX = (Psi_cipjp - Psi_cimjp) / dx;
DERY_C = (C_old(i, j + 1) - C_old(i, j)) / dx;
Mag2 = pow(DERX, 2) + pow(DERY, 2);
JT = DERY_C + Vm / R * C_T * (1 - C_T) * (HB - HA) * DERY;
JT_a = 0;
if (Mag2 > eps) {
JT_a = a * lambda * (1 - partition) * 2 * C_T / (1 + partition - (1 - partition) * psi_T) * dpsi_dt_T * DERY / sqrt(Mag2);
}
// Calculating bottom edge flux for anti-trapping term
g = pow(psi_B, 2) * pow((1 - psi_B), 2);
gprime = 2 * psi_B * (1 - psi_B) * (1 - 2 * psi_B);
HA = Wa * gprime + 30 * g * H_A * (1 / T_B - 1 / Tm_A);
HB = Wb * gprime + 30 * g * H_B * (1 / T_B - 1 / Tm_B);
DERY = (psi_old(i, j) - psi_old(i, j - 1)) / dx;
DERX = (Psi_cipjm - Psi_cimjm) / dx;
DERY_C = (C_old(i, j) - C_old(i, j - 1)) / dx;
Mag2 = pow(DERX, 2) + pow(DERY, 2);
JB = DERY_C + Vm / R * C_B * (1 - C_B) * (HB - HA) * DERY;
JB_a = 0;
if (Mag2 > eps) {
JB_a = a * lambda * (1 - partition) * 2 * C_B / (1 + partition - (1 - partition) * psi_B) * dpsi_dt_B * DERY / sqrt(Mag2);
}
// Update the concentration C
DR = D_s + pow(psi_R, 3) * (10 - 15 * psi_R + 6 * pow(psi_R, 2)) * (D_l - D_s);
DL = D_s + pow(psi_L, 3) * (10 - 15 * psi_L + 6 * pow(psi_L, 2)) * (D_l - D_s);
DT = D_s + pow(psi_T, 3) * (10 - 15 * psi_T + 6 * pow(psi_T, 2)) * (D_l - D_s);
DB = D_s + pow(psi_B, 3) * (10 - 15 * psi_B + 6 * pow(psi_B, 2)) * (D_l - D_s);
C_new(i, j) = C_old(i, j) + dt / dx * (DR * (JR + JR_a) - DL * (JL + JL_a) + DT * (JT + JT_a) - DB * (JB + JB_a));
}
}
for(int j=0;j<Nx+1;++j){
C_new(Nx + 1, j) = C_new(Nx - 1, j);
C_new(0, j) = C_new(2, j);
C_new(j, Ny + 1) = C_new(j, Ny - 1);
C_new(j, 0) = C_new(j, 2);
psi_new(Nx + 1, j) = psi_new(Nx - 1, j);
psi_new(0, j) = psi_new(2, j);
psi_new(j, Ny + 1) = psi_new(j, Ny - 1);
psi_new(j, 0) = psi_new(j, 2);
}
//UPDATING THE TEMPERATURE EQUATION!//
//Finte volume with explicit Euler
// KR = (1 - C_R) * K_A + C_R * K_B;
// KL = (1 - C_L) * K_A + C_L * K_B;
// KT = (1 - C_T) * K_A + C_T * K_B;
// KB = (1 - C_B) * K_A + C_B * K_B;
//
// //calculating right edge flux for the temperature field
//
// DERX_T = (T_old(i + 1, j) - T_old(i, j)) / dx;
// JR = KR * DERX_T;
//
// //calculating left edge flux for the temperature field
//
// DERX_T = (T_old(i, j) - T_old(i - 1, j)) / dx;
// JL = KL * DERX_T;
//
// //calculating top edge flux for the temperature field
//
// DERY_T = (T_old(i, j + 1) - T_old(i, j)) / dx;
// JT = KT * DERY_T;
//
// //calculating bottom edge flux for the temperature field
//
// DERY_T = (T_old(i, j) - T_old(i, j - 1)) / dx;
// JB = KB * DERY_T;
//
// cp = (1 - C_old(i, j)) * cp_A + C_old(i, j) * cp_B;
// Htilde = (1 - C_old(i, j)) * H_A + C_old(i, j) * H_B;
// g = pow(psi_old(i, j), 2) * pow((1 - psi_old(i, j)), 2);
//
//
// T_new(i,j) = dt / (cp * dx * dx) * (dx * (JR - JL + JT - JB) - dx * dx * 30 * g * Htilde * dpsi_dt(i, j)) + T_old(i, j);
//Finite difference
if(iter<4){
for (int i = 1; i <= Nx; ++i) {
for (int j = 1; j <= Ny; ++j) {
K=(1-C_new(i,j))*K_A+C_new(i,j)*K_B;
DERX_C=(C_new(i+1,j)-C_new(i,j))/dx;
DERY_C=(C_new(i,j+1)-C_new(i,j))/dx;
DERX_T=(T_old(i+1,j)+k_0(i+1,j)-T_old(i,j)+k_0(i,j))/dx;
DERY_T=(T_old(i,j+1)+k_0(i,j+1)-T_old(i,j)+k_0(i,j))/dx;
cp = (1 - C_new(i, j)) * cp_A + C_new(i, j) * cp_B;
Htilde = (1 - C_new(i, j)) * H_A + C_new(i, j) * H_B;
g = pow(psi_new(i, j), 2) * pow((1 - psi_new(i, j)), 2);
Nabla=1/pow(dx,2)*(0.5*(T_old(i+1,j)+k_0(i+1,j)+T_old(i-1,j)+k_0(i-1,j)+T_old(i,j+1)+k_0(i,j+1)+T_old(i,j-1)+k_0(i,j-1))+0.25*(T_old(i+1,j+1)+k_0(i+1,j+1)+T_old(i+1,j-1)+k_0(i+1,j-1)
+T_old(i-1,j+1)+k_0(i-1,j+1)+T_old(i-1,j-1)+k_0(i-1,j+1))-3*T_old(i,j)+k_0(i,j));
if(iter==0){
k1=1/cp*((K_B-K_A)*(DERX_C*DERX_T+DERY_C*DERY_T)+K*Nabla-30*g*Htilde*dpsi_dt(i,j));
k_1(i,j)=k1;
}else
if(iter==1){
k2=1/cp*((K_B-K_A)*(DERX_C*DERX_T+DERY_C*DERY_T)+K*Nabla-30*g*Htilde*dpsi_dt(i,j));
k_2(i,j)=k2;
}else
if(iter==2){
k3=1/cp*((K_B-K_A)*(DERX_C*DERX_T+DERY_C*DERY_T)+K*Nabla-30*g*Htilde*dpsi_dt(i,j));
k_3(i,j)=k3;
}else
if(iter==3){
k4=1/cp*((K_B-K_A)*(DERX_C*DERX_T+DERY_C*DERY_T)+K*Nabla-30*g*Htilde*dpsi_dt(i,j));
k_4(i,j)=k4;
//std::cout<<"k_1="<<k_1<<"\n"<<"k_2="<<k_2<<"\n"<<"k_3="<<k_3<<"\n"<<"k_4="<<k_4<<std::endl;
T_new(i,j)=T_old(i,j)+dt*(b1*k_1(i,j)+b2*k_2(i,j)+b3*k_3(i,j)+b4*k_4(i,j));
}
}
}
}
iter++;
}
“有可能以一种简单的方式并行化我的代码吗?”
您的代码并不简单,因此如果不花费太多时间,任何人都无法回答。
“我的所有变量都在您在此处看到的代码之前声明。”
我发现使用OpenMP要做相反的事情是最容易的。 每当OpenMP中有并行代码段时,您实际上都需要考虑同时发生的所有情况。 因此,如果声明一个变量,则该变量现在有n个副本。 如果您尝试写入在并行部分之外声明的变量,则n种不同的尝试都试图一次写入该资源。
如果您想使OpenMP变得简单,请使线程本地化尽可能多。 (也就是在该循环内声明将在循环中使用的所有变量)。 如果这不符合您的需要,请研究如何使用OpenMP的reduction
子句创建变量的本地副本,这些副本将在并行部分的末尾进行组合(通过加法,乘法等)以创建最终值,即代表所有线程的结果。 作为最后的选择,请在并行部分中引用外部资源,但是您可能需要在代码中添加critical
或atomic
注释,以确保一次只有一个线程在执行该部分代码。
要处理大型数组,可以更容易地将中间结果存储在为并行区域内的每个线程分配的子数组中,然后在每个子进程的末尾拥有一个最终的单线程代码段。数组和将结果适当存储回大型数组的句柄。
与所有其他内容一样,请确保您使用某种计时机制来确保您的更改实际上在加快速度! 我建议std :: chrono :: steady_clock,只要您要计时的代码区域要花几毫秒才能运行。
由于您遗漏了关键信息(例如算法的功能和所有变量的含义),因此我无法真正为您提供帮助,但是我可以为您提供一般的并行化建议。
首先,您需要使用性能分析器,找出代码中最耗时的部分。 我打赌这是for (int i = 1; i <= Nx; ++i)
for (int j = 1; j <= Ny; ++j)
部分,但我们需要确定。 假设您进行了分析,我是对的。 下一步是什么?
现在,您已在外部范围中声明了所有变量。 你就是做不到。 您修改的每个变量/指针都需要在并行函数/循环的范围内声明。 假设我对for循环是关键部分是正确的,我是说您的代码应该更像这样:
for (int i = 1; i <= Nx; ++i) {
for (int j = 1; j <= Ny; ++j) {
// Move all the declarations to local scope-- if this loop runs in parallel, each loop then has it's own variables to work with.
int Psi_cipjp = (psi_old(i + 1, j + 1) + psi_old(i, j + 1) + psi_old(i, j) + psi_old(i + 1, j)) / 4;
int Psi_cipjm = (psi_old(i + 1, j) + psi_old(i, j) + psi_old(i, j - 1) + psi_old(i + 1, j - 1)) / 4;
int Psi_cimjp = (psi_old(i, j + 1) + psi_old(i - 1, j + 1) + psi_old(i - 1, j) + psi_old(i, j)) / 4;
int Psi_cimjm = (psi_old(i, j) + psi_old(i - 1, j) + psi_old(i - 1, j - 1) + psi_old(i, j - 1)) / 4;
int DERX = (psi_old(i + 1, j) - psi_old(i, j)) / dx;
int DERY = (Psi_cipjp - Psi_cipjm) / dx;
/* and so on... */
}
}
但是您的算法似乎是一个复杂的算法,因此如果您发现必须认真思考一下实现以使其并行化,我不会感到惊讶。 我建议您在解决此问题之前尝试一些更简单的并行化问题。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.