C OpenMP parallel for loops make it much slower than single thread

Question

I am new to OpenMP in C. I used it to parallelize the for loops in a function, but it turns out that this drastically slows the loops down compared with the single-threaded case. For example, the loop takes around 10 s per point (halo) single-threaded, but a few minutes with OpenMP.

In this function, I am trying to calculate the density of many shells for each point (halo) by counting the particles that lie inside each shell, and then write the results out into an array. There are 512^3 particles and around 200 points (halos) that I want to process. I want to split the points (halos) across different threads to make it faster.

/* Element (i1, i2) of a flat array `a` viewed as 2-D, with n2 as the stride
 * applied to i1: a[i2 + n2*i1]. n1 is accepted for symmetry but is not used.
 * Every argument and the whole expansion are parenthesized so that expression
 * arguments (e.g. `k+1`) index the intended element. The result is an lvalue. */
#define ArrayAccess2D_n2(a, n1, n2, i1, i2) ((a)[ (i2)+(n2)*(i1) ])


/*
 * halo_shell_rho (version posted in the question; it exhibits the slowdown).
 *
 * For every halo index i in [halo_start, halo_end] (inclusive) this loops over
 * all par_number particles, counts how many fall into each of shell_bins
 * radial shells around the halo, and writes count/volume per shell together
 * with a per-shell radial coordinate into out_shell_den.
 *
 * Indexing used below (from the ArrayAccess2D_n2 macro, a[i2 + n2*i1]):
 *   halo_pos[i + halo_number*c]  - coordinate c (0,1,2) of halo i
 *   par_pos[c + dim*k]           - coordinate c (0,1,2) of particle k
 *   halo_R[i]                    - radius of halo i
 * out_shell_den must hold 2*(1+halo_end-halo_start)*shell_bins floats.
 *
 * shell_volume() and pb_distance() are defined elsewhere; their exact
 * semantics are not visible here.
 *
 * NOTE(review): halo_posx/y/z, halo_rad, par_posx/y/z and dist are declared
 * outside the parallel loop and are not listed in private(), so OpenMP treats
 * them as shared. Every thread writes them in its iterations, which is a data
 * race (results are unreliable) and is what the question is about.
 */
void halo_shell_rho(float boxsize, float *halo_pos, float *halo_R, int halo_number,\ 
int halo_start, int halo_end, float *par_pos, long long par_number,\ 
int shell_bins, float rmax_fac, float *out_shell_den){

    float temp; // not referenced anywhere in this function

    long long iter_sfs=0, iter_sfc=0, iter_ufs=0, iter_ufc=0; // not referenced anywhere in this function
    int dim=3; // number of spatial coordinates; used as the stride for par_pos

    // Shared between threads (see NOTE above): not in the private() list below.
    float par_posx, par_posy, par_posz, dist;
    float halo_posx, halo_posy, halo_posz, halo_rad;
    int i=0, ini_j=0, vol_j=0;
    int a=0, b=0;
    long long k=0;

    // One halo per iteration; only the loop counters are made thread-private.
    #pragma omp parallel for private(i, ini_j, vol_j, a, b, k)
    for(i=halo_start; i<=halo_end; i++){
            printf("halo %d\n", i);
            // Per-iteration scratch arrays (VLAs); declared inside the loop, so each thread has its own.
            float count[shell_bins];
            float volume[shell_bins];

            for(ini_j=0; ini_j<shell_bins; ini_j++){
                    count[ini_j] = 0;
                    volume[ini_j] = 0; }

            // Expands to halo_pos[i + halo_number*c] for c = 0, 1, 2.
            halo_posx = ArrayAccess2D_n2(halo_pos, dim, halo_number, 0, i);
            halo_posy = ArrayAccess2D_n2(halo_pos, dim, halo_number, 1, i);
            halo_posz = ArrayAccess2D_n2(halo_pos, dim, halo_number, 2, i);
            halo_rad = halo_R[i];

            // Shell vol_j spans radii [vol_j, vol_j+1] * halo_rad*rmax_fac/shell_bins.
            // The extra /1000 is presumably a unit conversion matching boxsize*1000 below - TODO confirm.
            for(vol_j=0; vol_j<shell_bins; vol_j++){

                    volume[vol_j] = shell_volume((vol_j+1)*halo_rad*rmax_fac/(shell_bins*1000), vol_j*halo_rad*rmax_fac/(shell_bins*1000)); }

            // Full pass over every particle for this halo.
            for(k=0; k<par_number; k++){

                    // Expands to par_pos[c + dim*k] for c = 0, 1, 2.
                    par_posx = ArrayAccess2D_n2(par_pos, par_number, dim, k, 0);
                    par_posy = ArrayAccess2D_n2(par_pos, par_number, dim, k, 1);
                    par_posz = ArrayAccess2D_n2(par_pos, par_number, dim, k, 2);

                    dist = pb_distance(boxsize*1000, halo_posx, halo_posy, halo_posz, par_posx, par_posy, par_posz); //1000 for boxsize in Mpc

                    // Coarse pre-filter; the bins below only extend to rmax_fac*halo_rad.
                    if(dist <= 2*rmax_fac*halo_rad){

                            // Linear search for the shell(s) containing dist.
                            for(a=0; a<shell_bins; a++){

                                    // Both bounds are inclusive, so a dist exactly on an edge
                                    // shared by two shells increments both of them.
                                    if((dist <= halo_rad*(a+1)*rmax_fac/shell_bins) && (dist >= halo_rad*a*rmax_fac/shell_bins)){

                                            count[a] += 1; }
                            }
                    }
            }

            for(b=0; b<shell_bins; b++){

            out_shell_den[(i-halo_start+0*(1+halo_end-halo_start))*shell_bins+b] = count[b]/volume[b]; 
            //out_shell_den is indexed as (2, 1+halo_end-halo_start, shell_bins):
            //plane 0 holds count/volume (written above), plane 1 holds the shell midpoint (2*b+1)*rmax_fac/(2*shell_bins) (written below)
            out_shell_den[(i-halo_start+1*(1+halo_end-halo_start))*shell_bins+b] = (2*b+1)*rmax_fac/(shell_bins*2);
            }
    }

}

Could anyone help me out with this? I know this is a very frequently asked question, but I could not find a solution in other posts. I am running it on a cluster with 32 threads, if that helps.

Thanks!

Answer 1

Thanks for @DavidSchwartz and @tim18.

Variables like halo_rad and par_posx are declared before the parallel region, which means they are implicitly shared among all threads. The code slows down because every thread keeps reading and writing the same variables; this is also a data race, so the results can be wrong as well as slow. One way to solve this is to add all of these variables to the private() clause. But I think the better way is to declare the variables inside the parallel loop, like this:

/*
 * halo_shell_rho: radial shell profile around a range of halos.
 *
 * For every halo index i in [halo_start, halo_end] (inclusive) the function
 * scans all particles, counts how many fall into each of shell_bins radial
 * shells of width halo_R[i]*rmax_fac/shell_bins, and stores the result.
 * Halos are distributed over OpenMP threads; all per-halo and per-particle
 * state is declared inside the loop so that each thread owns its own copy.
 *
 * Parameters:
 *   boxsize       box size; multiplied by 1000 before being handed to
 *                 pb_distance() (the original note says boxsize is in Mpc).
 *   halo_pos      halo coordinates, read as halo_pos[i + halo_number*c], c = 0..2.
 *   halo_R        halo radii, read as halo_R[i].
 *   halo_number   stride between coordinate planes of halo_pos.
 *   halo_start,
 *   halo_end      inclusive range of halo indices to process.
 *   par_pos       particle coordinates, read as par_pos[c + 3*k], c = 0..2.
 *   par_number    number of particles.
 *   shell_bins    number of shells per halo; nothing is done if <= 0.
 *   rmax_fac      outermost shell radius as a multiple of halo_R[i].
 *   out_shell_den output, indexed as (2, halo_end-halo_start+1, shell_bins):
 *                 plane 0 = count/volume per shell,
 *                 plane 1 = shell midpoint (2*b+1)*rmax_fac/(2*shell_bins).
 *
 * shell_volume() and pb_distance() are defined elsewhere.
 */
void halo_shell_rho(float boxsize, float *halo_pos, float *halo_R, int halo_number, int halo_start, int halo_end, float *par_pos, long long par_number, int shell_bins, float rmax_fac, float *out_shell_den){

    /* The per-halo scratch arrays are VLAs sized by shell_bins; a
       non-positive length would be undefined behaviour. */
    if(shell_bins <= 0){
            return; }

    const int dim = 3;
    const int halo_count = 1 + halo_end - halo_start;
    const float box_len = boxsize*1000; //1000 for boxsize in Mpc

    /* Loop counters and temporaries are block-scoped, hence private to each
       thread without needing a private() clause. */
    #pragma omp parallel for
    for(int i=halo_start; i<=halo_end; i++){
            printf("halo %d\n", i);

            float count[shell_bins];
            float volume[shell_bins];
            float edges[shell_bins+1];   /* edges[a]..edges[a+1] bound shell a */

            const float halo_posx = ArrayAccess2D_n2(halo_pos, dim, halo_number, 0, i);
            const float halo_posy = ArrayAccess2D_n2(halo_pos, dim, halo_number, 1, i);
            const float halo_posz = ArrayAccess2D_n2(halo_pos, dim, halo_number, 2, i);
            const float halo_rad = halo_R[i];

            /* Coarse pre-filter radius, kept from the original code; the shells
               themselves only reach rmax_fac*halo_rad. */
            const float cutoff = 2*rmax_fac*halo_rad;

            /* Shell edges depend only on the halo, so compute them once here
               instead of once per particle and bin. */
            for(int e=0; e<=shell_bins; e++){
                    edges[e] = halo_rad*e*rmax_fac/shell_bins; }

            for(int j=0; j<shell_bins; j++){
                    count[j] = 0;
                    /* Radii are additionally divided by 1000 for the volume;
                       presumably the same unit conversion as box_len - TODO confirm. */
                    volume[j] = shell_volume((j+1)*halo_rad*rmax_fac/(shell_bins*1000), j*halo_rad*rmax_fac/(shell_bins*1000)); }

            for(long long k=0; k<par_number; k++){
                    const float par_posx = ArrayAccess2D_n2(par_pos, par_number, dim, k, 0);
                    const float par_posy = ArrayAccess2D_n2(par_pos, par_number, dim, k, 1);
                    const float par_posz = ArrayAccess2D_n2(par_pos, par_number, dim, k, 2);

                    const float dist = pb_distance(box_len, halo_posx, halo_posy, halo_posz, par_posx, par_posy, par_posz);

                    if(dist <= cutoff){

                            /* NOTE(review): both bounds are inclusive, so a particle
                               exactly on an edge shared by two shells is counted in
                               both. Kept as in the original to leave results unchanged. */
                            for(int a=0; a<shell_bins; a++){

                                    if((dist <= edges[a+1]) && (dist >= edges[a])){

                                            count[a] += 1; }
                            }
                    }
            }

            /* Each halo writes only its own rows, so threads never touch the
               same output element. */
            for(int b=0; b<shell_bins; b++){

            out_shell_den[(i-halo_start)*shell_bins+b] = count[b]/volume[b];
            out_shell_den[(i-halo_start+halo_count)*shell_bins+b] = (2*b+1)*rmax_fac/(shell_bins*2);
            }
    }
}

C OpenMP parallel for loops make it much slower than single thread

Question

1 answers

solution1
0 2017-08-05 01:49:50

C OpenMP parallel for loops make it much slower than single thread

Question

1 answers

solution1 0 2017-08-05 01:49:50

solution1
0 2017-08-05 01:49:50