為什么openmp 32線程比1線程慢得多？

Question

我正在嘗試編寫一個計算 2 arrays 的 l2 范數的應用程序。 我必須並行計算。

這是我並行化的代碼：

  double time_start_openmp = omp_get_wtime();
  #pragma omp parallel for
  for (i = 0; i < n; i++)
  {
       numberOfThreads = omp_get_num_threads();
       double local_diff = x[i] - xseq[i];
       diff_vector[i] = local_diff;
       l2_norm += (local_diff * local_diff);
  }

   time_end_openmp = omp_get_wtime();

   l2_norm = sqrt(l2_norm);

   openmp_exec_time = time_end_openmp - time_start_openmp;
   printf("OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);

我將代碼編譯為：

gcc -fopenmp -g -ggdb -Wall -lm -o test test.c

我正在使用 1 個線程和 32 個線程運行此代碼。 output 與預期完全相反。 這是一個示例 output：

[hayri@hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=32
[hayri@hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 32 0.001084 0.000000000000e+00
[hayri@hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=1
[hayri@hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 1 0.000106 0.000000000000e+00

我看錯了還是使用 32 個線程比 1 個線程慢 10 倍？ 那么，我在這里做錯了什么？

這是我的完整代碼：

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <math.h>

#define MATSIZE 2000

static size_t totalMemUsage = 0;

size_t vectors_dot_prod(double *x, double *y, size_t n)
{
    double res = 0.0;
    size_t i;
    for (i = 0; i < n; i++)
    {
        res += x[i] * y[i];
    }
    return res;
}

size_t vectors_dot_prod2(double *x, double *y, size_t n)
{
    size_t res = 0.0;
    size_t i = 0;
    for (; i <= n - 4; i += 4)
    {
        res += (x[i] * y[i] +
                x[i + 1] * y[i + 1] +
                x[i + 2] * y[i + 2] +
                x[i + 3] * y[i + 3]);
    }
    for (; i < n; i++)
    {
        res += x[i] * y[i];
    }
    return res;
}

void matrix_vector_mult(double **mat, double *vec, double *result, size_t rows, size_t cols)
{ // in matrix form: result = mat * vec;
    size_t i;
    for (i = 0; i < rows; i++)
    {
        result[i] = vectors_dot_prod2(mat[i], vec, cols);
    }
}

double get_random()
{

    double range = 1000;
    double div = RAND_MAX / range;
    double randomNumber = (rand() / div);
    // printf("%d\n", randomNumber);
    return randomNumber;
}

void print_2d_arr(double *arr, size_t row, size_t col)
{
    size_t i, j, index;

    for (i = 0; i < row; i++)
    {
        for (j = 0; j < col; j++)
        {
            index = i * col + j;
            printf("%3f ", arr[index]);
        }
        printf("\n");
    }
}
void print_1d_arr(double *arr, size_t row)
{
    size_t i;
    for (i = 0; i < row; i++)
    {
        printf("%f, ", arr[i]);
    }
    printf("\n");
}

size_t **fullfillArrayWithRandomNumbers(double *arr, size_t n)
{
    /*
    * Fulfilling the array with random numbers 
    * */
    size_t i;
    for (i = 0; i < n; i++)
    {
        arr[i] = get_random();
    }
    return 0;
}

double *allocarray1D(size_t size)
{
    double *array = calloc(size, sizeof(double));
    totalMemUsage = totalMemUsage + size * sizeof(double);
    return array;
}

size_t ParallelRowMatrixVectorMultiply(size_t n, double *a, double *b, double *x, MPI_Comm comm)
{
    size_t i, j;
    size_t nlocal;
    double *fb;
    int npes, myrank;
    MPI_Comm_size(comm, &npes);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    fb = (double *)malloc(n * sizeof(double));
    nlocal = n / npes;
    MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
    for (i = 0; i < nlocal; i++)
    {
        x[i] = 0.0;
        for (j = 0; j < n; j++)
        {
            size_t index = i * n + j;
            x[i] += a[index] * fb[j];
        }
    }
    free(fb);
    return 0;
}

size_t ParallelRowMatrixVectorMultiply_WithoutAllgather(size_t n, double *a, double *b, double *x_partial, double *x, MPI_Comm comm)
{

    // Process 0 sends b to everyone
    MPI_Bcast(b, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    size_t i, j;
    size_t nlocal;
    // double *fb;
    int npes, myrank;
    MPI_Comm_size(comm, &npes);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // fb = (double *)malloc(n * sizeof(double));
    nlocal = n / npes;
    // MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
    for (i = 0; i < nlocal; i++)
    {
        x_partial[i] = 0.0;
        for (j = 0; j < n; j++)
        {
            size_t index = i * n + j;
            // printf("%f x %f\n", a[index], b[j]);
            x_partial[i] += a[index] * b[j];
        }
    }
    // free(b);

    // Process 0 gathers x_partials to create x
    MPI_Gather(x_partial, nlocal, MPI_DOUBLE, x, nlocal, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    return 0;
}

size_t SequentialMatrixMultiply(size_t n, double *a, double *b, double *x)
{
    size_t i, j;
    for (i = 0; i < n; i++)
    {
        x[i] = 0.0;
        for (j = 0; j < n; j++)
        {
            size_t index = i * n + j;
            // printf("%f x %f\n", a[index], b[j]);
            x[i] += a[index] * b[j];
        }
    }
    return 0;
}

int main(int argc, char *argv[])
{
    // Global declerations
    size_t i;
    // MPI_Status status;

    // Initialize the MPI environment
    MPI_Init(&argc, &argv);

    // Get the number of processes
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // Get the rank of the process
    int taskid;
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);

    // Get the name of the processor
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);

    if (argc != 2)
    {
        if (taskid == 0)
            printf("Usage: %s <N>\n", argv[0]);
        MPI_Finalize();
        return 0;
    }
    srand(time(NULL) + taskid);
    size_t n = atoi(argv[1]);
    size_t nOverK = n / world_size;

    double *a = allocarray1D(n * n);
    double *b = allocarray1D(n);
    double *x = allocarray1D(n);
    double *x_partial = allocarray1D(nOverK);
    double *xseq = allocarray1D(n);

    double *a_partial = allocarray1D(n * nOverK);

    if (a == NULL || b == NULL || x == NULL || xseq == NULL || x_partial == NULL)
    {
        if (taskid == 0)
            printf("Allocation failed\n");
        MPI_Finalize();
        return 0;
    }
    // Process 0 creates A matrix.
    if (taskid == 0)
    {
        fullfillArrayWithRandomNumbers(a, n * n);
        // Process 0 produces the b
        fullfillArrayWithRandomNumbers(b, n);
    }

    // Process 0 sends a_partial to everyone
    if (!(world_size == 1 && n == 64000))
    {
        MPI_Scatter(a, n * nOverK, MPI_DOUBLE, a_partial, n * nOverK, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    double time_start = MPI_Wtime();
    ParallelRowMatrixVectorMultiply_WithoutAllgather(n, a_partial, b, x_partial, x, MPI_COMM_WORLD);
    double time_end = MPI_Wtime();
    double parallel_exec_time = time_end - time_start;

    double *exec_times = allocarray1D(world_size);
    // Process 0 gathers x_partials to create x
    MPI_Gather(&parallel_exec_time, 1, MPI_DOUBLE, exec_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    // print_1d_arr(x, n);

    if (taskid == 0)
    {
        SequentialMatrixMultiply(n, a, b, xseq);
        // check difference between x and xseq using OpenMP
        //print_1d_arr(exec_times, world_size);
        // print_1d_arr(xseq, n);
        double max_exec, min_exec, avg_exec;
        min_exec = 1000;
        for (i = 0; i < world_size; i++)
        {
            if (max_exec < exec_times[i])
            {
                max_exec = exec_times[i];
            }
            if (min_exec > exec_times[i])
            {
                min_exec = exec_times[i];
            }
            avg_exec += exec_times[i];
        }
        avg_exec = avg_exec / world_size;

        long double time_start_openmp = omp_get_wtime();
        long double time_end_openmp, openmp_exec_time, min_exec_time, max_exec_time, avg_exec_time;
        max_exec_time = 0;
        max_exec_time = 1000;
        long double l2_norm = 0;
        size_t numberOfThreads = 0;
        size_t r = 0;
        double *diff_vector = allocarray1D(n);
        size_t nrepeat = 10000;

        if (world_size == 1)
        {
            #pragma omp parallel
            {
                numberOfThreads = omp_get_num_threads();
                #pragma omp parallel for private(i)
                for (i = 0; i < n; i++)
                {
                    double local_diff = x[i] - xseq[i];
                    diff_vector[i] = local_diff;
                    l2_norm += (local_diff * local_diff);
                }
            }
        }
        else
        {
            #pragma omp parallel
            {
                numberOfThreads = omp_get_num_threads();
                #pragma omp parallel for private(i)
                for (i = 0; i < n; i++)
                {
                    double local_diff = x[i] - xseq[i];
                    diff_vector[i] = local_diff;
                    l2_norm += (local_diff * local_diff);
                }
            }
        }
        l2_norm = sqrt(l2_norm);
        time_end_openmp = omp_get_wtime();
        openmp_exec_time = time_end_openmp - time_start_openmp;
        // print matrix size, number of processors, number of threads, time, time_openmp, L2 norm of difference of x and xseq (use %.12e while printing norm)
        if (world_size == 1)
        {
            printf("OPENMP: %d %ld %Lf %.12e\n", n, numberOfThreads, openmp_exec_time, openmp_exec_time, l2_norm);
            printf("NEW_OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
        }
        printf("MIN_AVG_MAX: %d %d %f %f %f\n", n, world_size, min_exec, max_exec, avg_exec);
        printf("MPI: %d %d %f %.12Lf %.12e\n", n, world_size, max_exec, l2_norm, l2_norm);
        totalMemUsage = totalMemUsage / (1024 * 1024 * 1024);
        printf("TOTALMEMUSAGE: %zu\n", totalMemUsage);

        //printf("process: %d %d %d %f %.12e\n", taskid, n, world_size, parallel_exec_time, l2_norm);
        //printf("%d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
    }
    MPI_Finalize();
    return 0;
}

這是output；


cn009
36
mpicc -fopenmp -g -ggdb  -lm -o rowmv rowmv.c 


OPENMP: 32000 1 0.000299 2.991110086441e-04
MIN_AVG_MAX: 32000 1 3.112523 3.112523 3.112523
MPI: 32000 1 3.112523 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 2 0.000535 5.350699648261e-04
MIN_AVG_MAX: 32000 1 3.125519 3.125519 3.125519
MPI: 32000 1 3.125519 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 4 0.000434 4.341900348663e-04
MIN_AVG_MAX: 32000 1 3.170650 3.170650 3.170650
MPI: 32000 1 3.170650 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 8 0.000454 4.542167298496e-04
MIN_AVG_MAX: 32000 1 3.168685 3.168685 3.168685
MPI: 32000 1 3.168685 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 16 0.000507 5.065393634140e-04
MIN_AVG_MAX: 32000 1 3.158761 3.158761 3.158761
MPI: 32000 1 3.158761 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 32 0.000875 8.752988651395e-04
MIN_AVG_MAX: 32000 1 3.166051 3.166051 3.166051
MPI: 32000 1 3.166051 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15

Answer 1

我看錯了還是使用 32 個線程比 1 個線程慢 10 倍？ 那么，我在這里做錯了什么？

在使用 OpenMP 進行分析和並行化的代碼部分中：

 #pragma omp parallel
 {
    numberOfThreads = omp_get_num_threads();
    #pragma omp parallel for private(i)
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
 }

有一個競爭條件，即訪問變量l2_norm 。 此外，您可以刪除private(i) ，因為並行循環中的索引變量（即i ）將被 OpenMP 隱式設置為私有。 競爭條件可以通過 OpenMP縮減來修復。 此外，您的循環實際上並沒有按照您的意願在線程之間分配迭代。 因為您再次將並行子句添加到該#pragma omp for ，並假設您已禁用嵌套並行性，默認情況下，在外部parallel region中創建的每個線程都將“按順序”執行該區域內的代碼，即：

    #pragma omp parallel for private(i)
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }

因此，每個線程將執行您打算並行化的循環的所有N次迭代。 因此，消除了並行性並為順序代碼增加了額外的開銷（例如，線程創建）。 要解決這些問題（即競爭條件和“嵌套”並行區域），請將此代碼更改為：

 #pragma omp parallel
 {
    numberOfThreads = omp_get_num_threads();
    #pragma omp for reduction(+:l2_norm)
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
 }

現在，解決了這些問題后，您仍然面臨另一個問題（性能方面），即並行循環是在OpenMP + MPI的混合並行化的上下文中執行的，並且您沒有顯式綁定OpenMP線程（在MPI進程）到相應的核心。 如果沒有這種顯式綁定，就無法確定這些線程最終會在哪個內核中運行。 自然地，在同一個邏輯核心中運行多個線程通常會增加被並行化的應用程序的整體執行。

如果您的應用程序使用線程，那么您可能希望確保您根本沒有被綁定（通過指定 --bind-to none），或者使用適當的綁定級別或每個應用程序特定數量的處理元素綁定到多個內核過程。 您可以通過以下任一方式解決此問題：

使用 MPI 標志--bind-to none禁用綁定，以將線程分配給不同的內核；
或相應地執行線程綁定。 檢查此SO 線程，了解如何在MPI + OpenMP等混合並行化中將線程 map 到內核。

通過相應地顯式設置每個進程的線程數，您可以避免多個線程最終位於同一個內核中，從而避免同一個內核中的線程爭奪相同的資源。

建議：

IMO 你應該首先單獨測試OpenMP的性能，而不需要任何 MPI 進程。 在這種情況下，通過針對2線程，然后是4 、 8等測量順序版本來測試代碼的可伸縮性，逐漸增加線程的數量。 最終，將有許多線程的代碼只是停止擴展。 自然，線程執行的並行工作量必須足夠大以克服並行性的開銷。 因此，您還應該使用越來越大的輸入進行測試。

在分析、測試改進的OpenMP版本之后，您可以使用MPI共享內存並行化與多個進程。

Answer 2

除了@dreamcrash 的回答中提到的更新共享變量的競爭條件外，您的代碼沒有正確分配工作。

#pragma omp parallel
{
    numberOfThreads = omp_get_num_threads();
    #pragma omp parallel for private(i)
                ~~~~~~~~
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
}

內部循環中的parallel構造使其for嵌套組合並行構造。 這意味着執行外部並行循環的團隊中的每個線程都會產生一個全新的並行區域，並將i循環分布在其中的線程上。 在外部並行區域中沒有發生分布，您最終會得到N個線程都在重復完全相同的工作。 默認情況下，嵌套並行被禁用，因此嵌套並行區域按順序運行，您的代碼有效地執行此操作：

#pragma omp parallel
{
    numberOfThreads = omp_get_num_threads();
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
}

沒有工作分配，所有線程都寫入diff_vector[]數組中的相同位置。

一方面，此代碼通常是受內存限制的，因為每字節數據的計算量很低——現代 CPU 在從 memory 獲取數據並將結果寫回那里需要許多周期時，每個周期可以執行許多乘法和減法。 由於限制因素是 memory 帶寬，因此內存限制問題不會隨着更多線程而變得更快。 在您的情況下，這不是什么大問題，因為 32K 數組條目占用了 256 KB 的 memory 並且適合大多數 CPU 緩存，並且 L3 緩存速度非常快，但仍然大於單個 L1 緩存中最快的緩存CPU核心。 另一方面，從多個線程寫入相同的 memory 區域會導致真假共享，相關的線程間緩存失效，這通常會導致並行代碼運行方式比順序版本慢。

有一些工具可以幫助您分析代碼的性能並發現問題。 正如我在評論中所寫的，英特爾 VTune 就是其中之一，並且作為 oneAPI 工具包的一部分免費提供。 Intel Inspector 是另一種（同樣免費並且是 oneAPI 工具包的一部分），它可以發現諸如數據競爭之類的問題。 這兩個工具可以很好地協同工作，我不能向任何有抱負的並行程序員強烈推薦它們。

還有一個小的競爭條件寫入numberOfThreads ，但是由於寫入的所有值都是相同的，所以這不是什么邏輯問題。 有問題的代碼的正確版本應該是：

#pragma omp parallel
{
    #pragma omp master
    numberOfThreads = omp_get_num_threads();

    #pragma omp parallel reduction(+:l2_norm)
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
}

為什么openmp 32線程比1線程慢得多？

問題描述

2 個解決方案

解決方案1
2 已采納 2020-12-16 18:33:42

解決方案2
1 2020-12-17 12:49:51

為什么openmp 32線程比1線程慢得多？

問題描述

2 個解決方案

解決方案1 2 已采納 2020-12-16 18:33:42

解決方案2 1 2020-12-17 12:49:51

解決方案1
2 已采納 2020-12-16 18:33:42

解決方案2
1 2020-12-17 12:49:51