Parallel programming in c++ with openmp

Question

This is my very first time attempting to parallelize my code. This attempt seems to cause "race" and the code produces nonsense values. Is it possible to parallelize my code in a simple manner? I know that the code block is quite long, sorry for that. All my variables are declared before the code you see here. Really would appreciate your help on this!

int numthreads=2;
omp_set_num_threads(numthreads);
#pragma omp parallel for

for (int t = 1; t <= tmax; ++t) {
    iter=0;
    while(iter<5){

        switch(iter){
            case 1:
                for(int j=0;j<Nx+1;++j){
                    k_1(Nx + 1, j) = k_1(Nx - 1, j);
                    k_1(0, j) = k_1(2, j);
                    k_1(j, Ny + 1) = k_1(j, Ny - 1);
                    k_1(j, 0) = C_new(j, 2);
                }
                k_0=a2*dt*k_1;
                break;
            case 2:
                for(int j=0;j<Nx+1;++j){
                    k_2(Nx + 1, j) = k_2(Nx - 1, j);
                    k_2(0, j) = k_2(2, j);
                    k_2(j, Ny + 1) = k_2(j, Ny - 1);
                    k_2(j, 0) = C_new(j, 2);
                }
                k_0=a3*dt*k_2;
                break;
            case 3:
                for(int j=0;j<Nx+1;++j){
                    k_3(Nx + 1, j) = k_3(Nx - 1, j);
                    k_3(0, j) = k_3(2, j);
                    k_3(j, Ny + 1) = k_3(j, Ny - 1);
                    k_3(j, 0) = k_3(j, 2);
                }
                k_0=a4*dt*k_3;
                break;
            case 4:
                k_0.fill(0);
                break;
        }



        for (int i = 1; i <= Nx; ++i) {
            for (int j = 1; j <= Ny; ++j) {

                // Computing ghost nodes values around (i,j)

                //Order parameter
                Psi_cipjp = (psi_old(i + 1, j + 1) + psi_old(i, j + 1) + psi_old(i, j) + psi_old(i + 1, j)) / 4;
                Psi_cipjm = (psi_old(i + 1, j) + psi_old(i, j) + psi_old(i, j - 1) + psi_old(i + 1, j - 1)) / 4;
                Psi_cimjp = (psi_old(i, j + 1) + psi_old(i - 1, j + 1) + psi_old(i - 1, j) + psi_old(i, j)) / 4;
                Psi_cimjm = (psi_old(i, j) + psi_old(i - 1, j) + psi_old(i - 1, j - 1) + psi_old(i, j - 1)) / 4;


                // UPDATING THE ORDER PARAMETER PSI !//

                // Calculating right edge flux JR

                DERX = (psi_old(i + 1, j) - psi_old(i, j)) / dx;
                DERY = (Psi_cipjp - Psi_cipjm) / dx;

                // Setting anisotropy parameters

                aniso(DERX, DERY, a_s, a_12, eps, epsilon);
                JR = Atheta * (Atheta * DERX + Aptheta * DERY);

                // Calculating left edge flux JL

                DERX = (psi_old(i, j) - psi_old(i - 1, j)) / dx;
                DERY = (Psi_cimjp - Psi_cimjm) / dx;

                // Setting anisotropy parameters

                aniso(DERX, DERY, a_s, a_12, eps, epsilon);
                JL = Atheta * (Atheta * DERX + Aptheta * DERY);

                // Calculating top edge flux JT

                DERY = (psi_old(i, j + 1) - psi_old(i, j)) / dx;
                DERX = (Psi_cipjp - Psi_cimjp) / dx;

                // Setting anisotropy parameters

                aniso(DERX, DERY, a_s, a_12, eps, epsilon);
                JT = Atheta * (Atheta * DERY - Aptheta * DERX);

                // Calculating bottom edge flux JB

                DERY = (psi_old(i, j) - psi_old(i, j - 1)) / dx;
                DERX = (Psi_cipjm - Psi_cimjm) / dx;

                // Setting anisotropy parameters

                aniso(DERX, DERY, a_s, a_12, eps, epsilon);
                JB = Atheta * (Atheta * DERY - Aptheta * DERX);


                // Update psi
                M = (1 - C_old(i, j)) * Ma + C_old(i, j) * Mb;
                g = pow(psi_old(i, j), 2) * pow((1 - psi_old(i, j)), 2);
                gprime = 2 * psi_old(i, j) * (1 - psi_old(i, j)) * (1 - 2 * psi_old(i, j));
                HA = Wa * gprime + 30 * g * H_A * (1 /( T_old(i, j)+k_0(i,j)) - 1 / Tm_A);
                HB = Wb * gprime + 30 * g * H_B * (1 / (T_old(i, j)+k_0(i,j)) - 1 / Tm_B);
                H = (1 - C_old(i, j)) * HA + C_old(i, j) * HB;
                rand=distr(gen);
                Noise=M*A_noise*rand*16*g*((1-C_old(i,j))*HA+C_old(i,j)*HB);

                dpsi=(dt / dx) * ((JR - JL + JT - JB) * M * Epsilon2 - dx * M * H-dx*Noise);
                psi_new(i, j) = psi_old(i, j) + dpsi;
                dpsi_dt(i, j) = dpsi/ dt;
                //std::cout<<"dpsi_dt="<<dpsi_dt(i,j)<<std::endl;

                // UPDATING THE CONCENTRATION FIELD ! //

                //Evaluating field values on finite volume boundary

                dpsi_dt_R = (dpsi_dt(i + 1, j) + dpsi_dt(i, j)) / 2;
                dpsi_dt_L = (dpsi_dt(i, j) + dpsi_dt(i - 1, j)) / 2;
                dpsi_dt_T = (dpsi_dt(i, j + 1) + dpsi_dt(i, j)) / 2;
                dpsi_dt_B = (dpsi_dt(i, j) + dpsi_dt(i, j - 1)) / 2;
                psi_R = (psi_old(i + 1, j) + psi_old(i, j)) / 2;
                psi_L = (psi_old(i, j) + psi_old(i - 1, j)) / 2;
                psi_T = (psi_old(i, j + 1) + psi_old(i, j)) / 2;
                psi_B = (psi_old(i, j) + psi_old(i, j - 1)) / 2;
                C_R = (C_old(i + 1, j) + C_old(i, j)) / 2;
                C_L = (C_old(i, j) + C_old(i - 1, j)) / 2;
                C_T = (C_old(i, j + 1) + C_old(i, j)) / 2;
                C_B = (C_old(i, j) + C_old(i, j - 1)) / 2;
                T_R = (T_old(i + 1, j)+k_0(i+1,j) + T_old(i, j)+k_0(i,j)) / 2;
                T_L = (T_old(i, j)+k_0(i,j) + T_old(i - 1, j)+k_0(i-1,j)) / 2;
                T_T = (T_old(i, j + 1)+k_0(i,j+1) + T_old(i, j)+k_0(i,j)) / 2;
                T_B = (T_old(i, j)+k_0(i,j) + T_old(i, j - 1)+k_0(i,j-1)) / 2;
                Psi_cipjp = (psi_old(i + 1, j + 1) + psi_old(i, j + 1) + psi_old(i, j) + psi_old(i + 1, j)) / 4;
                Psi_cipjm = (psi_old(i + 1, j) + psi_old(i, j) + psi_old(i, j - 1) + psi_old(i + 1, j - 1)) / 4;
                Psi_cimjp = (psi_old(i, j + 1) + psi_old(i - 1, j + 1) + psi_old(i - 1, j) + psi_old(i, j)) / 4;
                Psi_cimjm = (psi_old(i, j) + psi_old(i - 1, j) + psi_old(i - 1, j - 1) + psi_old(i, j - 1)) / 4;

                // Calculating right edge flux for anti-trapping term

                g = pow(psi_R, 2) * pow((1 - psi_R), 2);
                gprime = 2 * psi_R * (1 - psi_R) * (1 - 2 * psi_R);
                HA = Wa * gprime + 30 * g * H_A * (1 / T_R - 1 / Tm_A);
                HB = Wb * gprime + 30 * g * H_B * (1 / T_R - 1 / Tm_B);

                DERX = (psi_old(i + 1, j) - psi_old(i, j)) / dx;
                DERY = (Psi_cipjp - Psi_cipjm) / dx;
                DERX_C = (C_old(i + 1, j) - C_old(i, j)) / dx;
                Mag2 = pow(DERX, 2) + pow(DERY, 2);
                JR = DERX_C + Vm / R * C_R * (1 - C_R) * (HB - HA) * DERX;

                JR_a = 0;

                if (Mag2 > eps) {
                    JR_a = a * lambda * (1 - partition) * 2 * C_R / (1 + partition - (1 - partition) * psi_R) * dpsi_dt_R * DERX / sqrt(Mag2);
                }

                // Calculating left edge flux for anti-trapping term

                g = pow(psi_L, 2) * pow((1 - psi_L), 2);
                gprime = 2 * psi_L * (1 - psi_L) * (1 - 2 * psi_L);
                HA = Wa * gprime + 30 * g * H_A * (1 / T_L - 1 / Tm_A);
                HB = Wb * gprime + 30 * g * H_B * (1 / T_L - 1 / Tm_B);

                DERX = (psi_old(i, j) - psi_old(i - 1, j)) / dx;
                DERY = (Psi_cimjp - Psi_cimjm) / dx;
                DERX_C = (C_old(i, j) - C_old(i - 1, j)) / dx;
                Mag2 = pow(DERX, 2) + pow(DERY, 2);
                JL = DERX_C + Vm / R * C_L * (1 - C_L) * (HB - HA) * DERX;

                JL_a = 0;

                if (Mag2 > eps) {
                    JL_a = a * lambda * (1 - partition) * 2 * C_L / (1 + partition - (1 - partition) * psi_L) * dpsi_dt_L * DERX / sqrt(Mag2);
                }

                // Calculating top edge flux for anti-trapping term

                g = pow(psi_T, 2) * pow((1 - psi_T), 2);
                gprime = 2 * psi_T * (1 - psi_T) * (1 - 2 * psi_T);
                HA = Wa * gprime + 30 * g * H_A * (1 / T_T - 1 / Tm_A);
                HB = Wb * gprime + 30 * g * H_B * (1 / T_T - 1 / Tm_B);

                DERY = (psi_old(i, j + 1) - psi_old(i, j)) / dx;
                DERX = (Psi_cipjp - Psi_cimjp) / dx;
                DERY_C = (C_old(i, j + 1) - C_old(i, j)) / dx;
                Mag2 = pow(DERX, 2) + pow(DERY, 2);
                JT = DERY_C + Vm / R * C_T * (1 - C_T) * (HB - HA) * DERY;

                JT_a = 0;

                if (Mag2 > eps) {
                    JT_a = a * lambda * (1 - partition) * 2 * C_T / (1 + partition - (1 - partition) * psi_T) * dpsi_dt_T * DERY / sqrt(Mag2);
                }

                // Calculating bottom edge flux for anti-trapping term

                g = pow(psi_B, 2) * pow((1 - psi_B), 2);
                gprime = 2 * psi_B * (1 - psi_B) * (1 - 2 * psi_B);
                HA = Wa * gprime + 30 * g * H_A * (1 / T_B - 1 / Tm_A);
                HB = Wb * gprime + 30 * g * H_B * (1 / T_B - 1 / Tm_B);

                DERY = (psi_old(i, j) - psi_old(i, j - 1)) / dx;
                DERX = (Psi_cipjm - Psi_cimjm) / dx;
                DERY_C = (C_old(i, j) - C_old(i, j - 1)) / dx;
                Mag2 = pow(DERX, 2) + pow(DERY, 2);
                JB = DERY_C + Vm / R * C_B * (1 - C_B) * (HB - HA) * DERY;

                JB_a = 0;

                if (Mag2 > eps) {
                    JB_a = a * lambda * (1 - partition) * 2 * C_B / (1 + partition - (1 - partition) * psi_B) * dpsi_dt_B * DERY / sqrt(Mag2);
                }

                // Update the concentration C

                DR = D_s + pow(psi_R, 3) * (10 - 15 * psi_R + 6 * pow(psi_R, 2)) * (D_l - D_s);
                DL = D_s + pow(psi_L, 3) * (10 - 15 * psi_L + 6 * pow(psi_L, 2)) * (D_l - D_s);
                DT = D_s + pow(psi_T, 3) * (10 - 15 * psi_T + 6 * pow(psi_T, 2)) * (D_l - D_s);
                DB = D_s + pow(psi_B, 3) * (10 - 15 * psi_B + 6 * pow(psi_B, 2)) * (D_l - D_s);

                C_new(i, j) = C_old(i, j) + dt / dx * (DR * (JR + JR_a) - DL * (JL + JL_a) + DT * (JT + JT_a) - DB * (JB + JB_a));


            }
        }

        for(int j=0;j<Nx+1;++j){
            C_new(Nx + 1, j) = C_new(Nx - 1, j);
            C_new(0, j) = C_new(2, j);
            C_new(j, Ny + 1) = C_new(j, Ny - 1);
            C_new(j, 0) = C_new(j, 2);
            psi_new(Nx + 1, j) = psi_new(Nx - 1, j);
            psi_new(0, j) = psi_new(2, j);
            psi_new(j, Ny + 1) = psi_new(j, Ny - 1);
            psi_new(j, 0) = psi_new(j, 2);
        }




        //UPDATING THE TEMPERATURE EQUATION!//

        //Finte volume with explicit Euler


        //                  KR = (1 - C_R) * K_A + C_R * K_B;
        //                  KL = (1 - C_L) * K_A + C_L * K_B;
        //                  KT = (1 - C_T) * K_A + C_T * K_B;
        //                  KB = (1 - C_B) * K_A + C_B * K_B;
        //  
        //                 //calculating right edge flux for the temperature field
        //                 
        //                  DERX_T = (T_old(i + 1, j) - T_old(i, j)) / dx;
        //                  JR = KR * DERX_T;
        // 
        //                  //calculating left edge flux for the temperature field
        //  
        //                  DERX_T = (T_old(i, j) - T_old(i - 1, j)) / dx;
        //                  JL = KL * DERX_T;
        //  
        //                  //calculating top edge flux for the temperature field
        //  
        //                  DERY_T = (T_old(i, j + 1) - T_old(i, j)) / dx;
        //                  JT = KT * DERY_T;
        //  
        //                  //calculating bottom edge flux for the temperature field
        //  
        //                  DERY_T = (T_old(i, j) - T_old(i, j - 1)) / dx;
        //                  JB = KB * DERY_T;
        //                  
        //                  cp = (1 - C_old(i, j)) * cp_A + C_old(i, j) * cp_B;
        //                  Htilde = (1 - C_old(i, j)) * H_A + C_old(i, j) * H_B;
        //                  g = pow(psi_old(i, j), 2) * pow((1 - psi_old(i, j)), 2);
        //                  
        //                  
        //                  T_new(i,j) = dt / (cp * dx * dx) * (dx * (JR - JL + JT - JB) - dx * dx * 30 * g * Htilde * dpsi_dt(i, j)) + T_old(i, j);


        //Finite difference

        if(iter<4){
            for (int i = 1; i <= Nx; ++i) {
                for (int j = 1; j <= Ny; ++j) {

                    K=(1-C_new(i,j))*K_A+C_new(i,j)*K_B;
                    DERX_C=(C_new(i+1,j)-C_new(i,j))/dx;
                    DERY_C=(C_new(i,j+1)-C_new(i,j))/dx;
                    DERX_T=(T_old(i+1,j)+k_0(i+1,j)-T_old(i,j)+k_0(i,j))/dx;
                    DERY_T=(T_old(i,j+1)+k_0(i,j+1)-T_old(i,j)+k_0(i,j))/dx;

                    cp = (1 - C_new(i, j)) * cp_A + C_new(i, j) * cp_B;
                    Htilde = (1 - C_new(i, j)) * H_A + C_new(i, j) * H_B;
                    g = pow(psi_new(i, j), 2) * pow((1 - psi_new(i, j)), 2);

                    Nabla=1/pow(dx,2)*(0.5*(T_old(i+1,j)+k_0(i+1,j)+T_old(i-1,j)+k_0(i-1,j)+T_old(i,j+1)+k_0(i,j+1)+T_old(i,j-1)+k_0(i,j-1))+0.25*(T_old(i+1,j+1)+k_0(i+1,j+1)+T_old(i+1,j-1)+k_0(i+1,j-1)
                    +T_old(i-1,j+1)+k_0(i-1,j+1)+T_old(i-1,j-1)+k_0(i-1,j+1))-3*T_old(i,j)+k_0(i,j));


                    if(iter==0){
                        k1=1/cp*((K_B-K_A)*(DERX_C*DERX_T+DERY_C*DERY_T)+K*Nabla-30*g*Htilde*dpsi_dt(i,j));
                        k_1(i,j)=k1;
                    }else
                        if(iter==1){
                            k2=1/cp*((K_B-K_A)*(DERX_C*DERX_T+DERY_C*DERY_T)+K*Nabla-30*g*Htilde*dpsi_dt(i,j));
                            k_2(i,j)=k2;
                        }else
                            if(iter==2){
                                k3=1/cp*((K_B-K_A)*(DERX_C*DERX_T+DERY_C*DERY_T)+K*Nabla-30*g*Htilde*dpsi_dt(i,j));
                                k_3(i,j)=k3;
                            }else
                                if(iter==3){
                                    k4=1/cp*((K_B-K_A)*(DERX_C*DERX_T+DERY_C*DERY_T)+K*Nabla-30*g*Htilde*dpsi_dt(i,j));
                                    k_4(i,j)=k4;
                                    //std::cout<<"k_1="<<k_1<<"\n"<<"k_2="<<k_2<<"\n"<<"k_3="<<k_3<<"\n"<<"k_4="<<k_4<<std::endl;
                                    T_new(i,j)=T_old(i,j)+dt*(b1*k_1(i,j)+b2*k_2(i,j)+b3*k_3(i,j)+b4*k_4(i,j));
                                }



                }         
            }

        }
        iter++;
    }

Answer 1

"Is it possible to parallelize my code in a simple manner?"

Your code is not simple, so no one will be able to answer that without investing way too much time.

"All my variables are declared before the code you see here."

I found that with OpenMP it's easiest to do exactly the opposite of this. Whenever you have a parallel section of code in OpenMP, you really need to think about everything in there happening multiple times, concurrently. So if you declare a variable, there are now n copies of that variable. If you try to write to a variable declared outside of the parallel section, n different things are trying to write to that resource at once.

If you want to make OpenMP easy, keep as many things thread-local as possible. (AKA declare all variables you'll use in a loop, within that loop). When that doesn't fit what you need, look into how to use OpenMP's reduction clause to create local copies of variables that will be combined at the end of the parallel section (through addition, multiplication, etc) to create a final value that is representative of the outcome of all of the threads. As a last resort, reference outside resources in the parallel section, but you'll probably need critical or atomic notes in the code to ensure that only one thread is executing that portion of code at a time.

For manipulating large arrays, it can be easier to store intermediate results in sub-arrays allocated for each thread within the parallel region, and then have one, final, single-threaded piece of code at the end that runs through each of the sub-arrays and handles storing the results appropriately back into the large array.

And as with everything, make sure you are using some sort of timing mechanism to ensure your changes are actually speeding it up! I recommend std::chrono::steady_clock as long as the code regions you are timing taking more than a few ms to run.

Answer 2

I can't really help you since you've left out key information like what the algorithm does and what all the variables are, but I can give you general parallelization advice.

First off, you need to use a performance profiler and find out what parts of your code are the most time-consuming. I'd wager it's the for (int i = 1; i <= Nx; ++i) for (int j = 1; j <= Ny; ++j) section but we need to know for sure. Let's assume you do the profiling and I'm right. What's the next step?

Right now you have all of your variables declared in the outer scope. You just can't do this. Every variable/pointer you modify needs to be declared within the scope of your parallelized function/loop. Assuming I'm right about the for loop being the critical section, I'm saying your code should look more like this:

for (int i = 1; i <= Nx; ++i) {
    for (int j = 1; j <= Ny; ++j) {
        // Move all the declarations to local scope-- if this loop runs in parallel, each loop then has it's own variables to work with.
        int Psi_cipjp = (psi_old(i + 1, j + 1) + psi_old(i, j + 1) + psi_old(i, j) + psi_old(i + 1, j)) / 4;
        int Psi_cipjm = (psi_old(i + 1, j) + psi_old(i, j) + psi_old(i, j - 1) + psi_old(i + 1, j - 1)) / 4;
        int Psi_cimjp = (psi_old(i, j + 1) + psi_old(i - 1, j + 1) + psi_old(i - 1, j) + psi_old(i, j)) / 4;
        int Psi_cimjm = (psi_old(i, j) + psi_old(i - 1, j) + psi_old(i - 1, j - 1) + psi_old(i, j - 1)) / 4;

        int DERX = (psi_old(i + 1, j) - psi_old(i, j)) / dx;
        int DERY = (Psi_cipjp - Psi_cipjm) / dx;

        /* and so on... */
    }
}

But your algorithm appears to be a complex one so I wouldn't be surprised if you find that you'll have to seriously rethink the implementation in order to parallize it. I'd recommend you attempt somewhat simpler parallelization problems before tackling this one.

Parallel programming in c++ with openmp

Question

2 answers

solution1
1 2015-06-24 15:20:28

solution2
1 ACCPTED 2015-06-24 15:32:53

Parallel programming in c++ with openmp

Question

2 answers

solution1 1 2015-06-24 15:20:28

solution2 1 ACCPTED 2015-06-24 15:32:53

solution1
1 2015-06-24 15:20:28

solution2
1 ACCPTED 2015-06-24 15:32:53