
Why is OpenMP with 32 threads much slower than with 1 thread?

I am trying to write an application that calculates the L2 norm of two arrays. I have to parallelize my calculation.

Here is the code that I have parallelized:

  double time_start_openmp = omp_get_wtime();
  #pragma omp parallel for
  for (i = 0; i < n; i++)
  {
       numberOfThreads = omp_get_num_threads();
       double local_diff = x[i] - xseq[i];
       diff_vector[i] = local_diff;
       l2_norm += (local_diff * local_diff);
  }

   time_end_openmp = omp_get_wtime();

   l2_norm = sqrt(l2_norm);

   openmp_exec_time = time_end_openmp - time_start_openmp;
   printf("OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);

I compile the code as:

gcc -fopenmp -g -ggdb -Wall -lm -o test test.c 

I am running this code with 1 thread and with 32 threads. The output is the exact opposite of what I expected. Here is an example output:

[hayri@hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=32
[hayri@hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 32 0.001084 0.000000000000e+00
[hayri@hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=1
[hayri@hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 1 0.000106 0.000000000000e+00

Am I reading this wrong, or is using 32 threads 10 times slower than 1 thread? What am I doing wrong here?

Here is my full code:

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <math.h>

#define MATSIZE 2000

static size_t totalMemUsage = 0;

size_t vectors_dot_prod(double *x, double *y, size_t n)
{
    double res = 0.0;
    size_t i;
    for (i = 0; i < n; i++)
    {
        res += x[i] * y[i];
    }
    return res;
}

size_t vectors_dot_prod2(double *x, double *y, size_t n)
{
    size_t res = 0.0;
    size_t i = 0;
    for (; i <= n - 4; i += 4)
    {
        res += (x[i] * y[i] +
                x[i + 1] * y[i + 1] +
                x[i + 2] * y[i + 2] +
                x[i + 3] * y[i + 3]);
    }
    for (; i < n; i++)
    {
        res += x[i] * y[i];
    }
    return res;
}

void matrix_vector_mult(double **mat, double *vec, double *result, size_t rows, size_t cols)
{ // in matrix form: result = mat * vec;
    size_t i;
    for (i = 0; i < rows; i++)
    {
        result[i] = vectors_dot_prod2(mat[i], vec, cols);
    }
}

double get_random()
{

    double range = 1000;
    double div = RAND_MAX / range;
    double randomNumber = (rand() / div);
    // printf("%d\n", randomNumber);
    return randomNumber;
}

void print_2d_arr(double *arr, size_t row, size_t col)
{
    size_t i, j, index;

    for (i = 0; i < row; i++)
    {
        for (j = 0; j < col; j++)
        {
            index = i * col + j;
            printf("%3f ", arr[index]);
        }
        printf("\n");
    }
}
void print_1d_arr(double *arr, size_t row)
{
    size_t i;
    for (i = 0; i < row; i++)
    {
        printf("%f, ", arr[i]);
    }
    printf("\n");
}

size_t **fullfillArrayWithRandomNumbers(double *arr, size_t n)
{
    /*
    * Fulfilling the array with random numbers 
    * */
    size_t i;
    for (i = 0; i < n; i++)
    {
        arr[i] = get_random();
    }
    return 0;
}

double *allocarray1D(size_t size)
{
    double *array = calloc(size, sizeof(double));
    totalMemUsage = totalMemUsage + size * sizeof(double);
    return array;
}

size_t ParallelRowMatrixVectorMultiply(size_t n, double *a, double *b, double *x, MPI_Comm comm)
{
    size_t i, j;
    size_t nlocal;
    double *fb;
    int npes, myrank;
    MPI_Comm_size(comm, &npes);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    fb = (double *)malloc(n * sizeof(double));
    nlocal = n / npes;
    MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
    for (i = 0; i < nlocal; i++)
    {
        x[i] = 0.0;
        for (j = 0; j < n; j++)
        {
            size_t index = i * n + j;
            x[i] += a[index] * fb[j];
        }
    }
    free(fb);
    return 0;
}

size_t ParallelRowMatrixVectorMultiply_WithoutAllgather(size_t n, double *a, double *b, double *x_partial, double *x, MPI_Comm comm)
{

    // Process 0 sends b to everyone
    MPI_Bcast(b, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    size_t i, j;
    size_t nlocal;
    // double *fb;
    int npes, myrank;
    MPI_Comm_size(comm, &npes);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // fb = (double *)malloc(n * sizeof(double));
    nlocal = n / npes;
    // MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
    for (i = 0; i < nlocal; i++)
    {
        x_partial[i] = 0.0;
        for (j = 0; j < n; j++)
        {
            size_t index = i * n + j;
            // printf("%f x %f\n", a[index], b[j]);
            x_partial[i] += a[index] * b[j];
        }
    }
    // free(b);

    // Process 0 gathers x_partials to create x
    MPI_Gather(x_partial, nlocal, MPI_DOUBLE, x, nlocal, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    return 0;
}

size_t SequentialMatrixMultiply(size_t n, double *a, double *b, double *x)
{
    size_t i, j;
    for (i = 0; i < n; i++)
    {
        x[i] = 0.0;
        for (j = 0; j < n; j++)
        {
            size_t index = i * n + j;
            // printf("%f x %f\n", a[index], b[j]);
            x[i] += a[index] * b[j];
        }
    }
    return 0;
}

int main(int argc, char *argv[])
{
    // Global declerations
    size_t i;
    // MPI_Status status;

    // Initialize the MPI environment
    MPI_Init(&argc, &argv);

    // Get the number of processes
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // Get the rank of the process
    int taskid;
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);

    // Get the name of the processor
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);

    if (argc != 2)
    {
        if (taskid == 0)
            printf("Usage: %s <N>\n", argv[0]);
        MPI_Finalize();
        return 0;
    }
    srand(time(NULL) + taskid);
    size_t n = atoi(argv[1]);
    size_t nOverK = n / world_size;

    double *a = allocarray1D(n * n);
    double *b = allocarray1D(n);
    double *x = allocarray1D(n);
    double *x_partial = allocarray1D(nOverK);
    double *xseq = allocarray1D(n);

    double *a_partial = allocarray1D(n * nOverK);

    if (a == NULL || b == NULL || x == NULL || xseq == NULL || x_partial == NULL)
    {
        if (taskid == 0)
            printf("Allocation failed\n");
        MPI_Finalize();
        return 0;
    }
    // Process 0 creates A matrix.
    if (taskid == 0)
    {
        fullfillArrayWithRandomNumbers(a, n * n);
        // Process 0 produces the b
        fullfillArrayWithRandomNumbers(b, n);
    }

    // Process 0 sends a_partial to everyone
    if (!(world_size == 1 && n == 64000))
    {
        MPI_Scatter(a, n * nOverK, MPI_DOUBLE, a_partial, n * nOverK, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    double time_start = MPI_Wtime();
    ParallelRowMatrixVectorMultiply_WithoutAllgather(n, a_partial, b, x_partial, x, MPI_COMM_WORLD);
    double time_end = MPI_Wtime();
    double parallel_exec_time = time_end - time_start;

    double *exec_times = allocarray1D(world_size);
    // Process 0 gathers x_partials to create x
    MPI_Gather(&parallel_exec_time, 1, MPI_DOUBLE, exec_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    // print_1d_arr(x, n);

    if (taskid == 0)
    {
        SequentialMatrixMultiply(n, a, b, xseq);
        // check difference between x and xseq using OpenMP
        //print_1d_arr(exec_times, world_size);
        // print_1d_arr(xseq, n);
        double max_exec, min_exec, avg_exec;
        min_exec = 1000;
        for (i = 0; i < world_size; i++)
        {
            if (max_exec < exec_times[i])
            {
                max_exec = exec_times[i];
            }
            if (min_exec > exec_times[i])
            {
                min_exec = exec_times[i];
            }
            avg_exec += exec_times[i];
        }
        avg_exec = avg_exec / world_size;

        long double time_start_openmp = omp_get_wtime();
        long double time_end_openmp, openmp_exec_time, min_exec_time, max_exec_time, avg_exec_time;
        max_exec_time = 0;
        max_exec_time = 1000;
        long double l2_norm = 0;
        size_t numberOfThreads = 0;
        size_t r = 0;
        double *diff_vector = allocarray1D(n);
        size_t nrepeat = 10000;

        if (world_size == 1)
        {
            #pragma omp parallel
            {
                numberOfThreads = omp_get_num_threads();
                #pragma omp parallel for private(i)
                for (i = 0; i < n; i++)
                {
                    double local_diff = x[i] - xseq[i];
                    diff_vector[i] = local_diff;
                    l2_norm += (local_diff * local_diff);
                }
            }
        }
        else
        {
            #pragma omp parallel
            {
                numberOfThreads = omp_get_num_threads();
                #pragma omp parallel for private(i)
                for (i = 0; i < n; i++)
                {
                    double local_diff = x[i] - xseq[i];
                    diff_vector[i] = local_diff;
                    l2_norm += (local_diff * local_diff);
                }
            }
        }
        l2_norm = sqrt(l2_norm);
        time_end_openmp = omp_get_wtime();
        openmp_exec_time = time_end_openmp - time_start_openmp;
        // print matrix size, number of processors, number of threads, time, time_openmp, L2 norm of difference of x and xseq (use %.12e while printing norm)
        if (world_size == 1)
        {
            printf("OPENMP: %d %ld %Lf %.12e\n", n, numberOfThreads, openmp_exec_time, openmp_exec_time, l2_norm);
            printf("NEW_OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
        }
        printf("MIN_AVG_MAX: %d %d %f %f %f\n", n, world_size, min_exec, max_exec, avg_exec);
        printf("MPI: %d %d %f %.12Lf %.12e\n", n, world_size, max_exec, l2_norm, l2_norm);
        totalMemUsage = totalMemUsage / (1024 * 1024 * 1024);
        printf("TOTALMEMUSAGE: %zu\n", totalMemUsage);

        //printf("process: %d %d %d %f %.12e\n", taskid, n, world_size, parallel_exec_time, l2_norm);
        //printf("%d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
    }
    MPI_Finalize();
    return 0;
}

Here is the output:


cn009
36
mpicc -fopenmp -g -ggdb  -lm -o rowmv rowmv.c 


OPENMP: 32000 1 0.000299 2.991110086441e-04
MIN_AVG_MAX: 32000 1 3.112523 3.112523 3.112523
MPI: 32000 1 3.112523 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 2 0.000535 5.350699648261e-04
MIN_AVG_MAX: 32000 1 3.125519 3.125519 3.125519
MPI: 32000 1 3.125519 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 4 0.000434 4.341900348663e-04
MIN_AVG_MAX: 32000 1 3.170650 3.170650 3.170650
MPI: 32000 1 3.170650 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 8 0.000454 4.542167298496e-04
MIN_AVG_MAX: 32000 1 3.168685 3.168685 3.168685
MPI: 32000 1 3.168685 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 16 0.000507 5.065393634140e-04
MIN_AVG_MAX: 32000 1 3.158761 3.158761 3.158761
MPI: 32000 1 3.158761 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 32 0.000875 8.752988651395e-04
MIN_AVG_MAX: 32000 1 3.166051 3.166051 3.166051
MPI: 32000 1 3.166051 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15

Am I reading this wrong, or is using 32 threads 10 times slower than 1 thread? What am I doing wrong here?

In the portion of code that is being both profiled and parallelized with OpenMP:

 #pragma omp parallel
 {
    numberOfThreads = omp_get_num_threads();
    #pragma omp parallel for private(i)
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
 }

there is a race condition, namely the access to the variable l2_norm. Moreover, you can drop the private(i), since the index variable (i.e., i) of the parallelized loop will be implicitly made private by OpenMP. The race condition can be fixed with an OpenMP reduction. Furthermore, your loop is not actually distributing the iterations among threads as you intended. Because you added the parallel clause again to that #pragma omp for, and assuming that nested parallelism is disabled (which it is by default), each of the threads created in the outer parallel region will execute "sequentially" the code within that region, namely:

    #pragma omp parallel for private(i)
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }

Hence, each thread will execute all the N iterations of the loop that you intended to parallelize. This removes the parallelism and adds extra overhead (e.g., thread creation) on top of the sequential code. To fix those problems (i.e., the race condition and the "nested" parallel region), change this code to:

 #pragma omp parallel
 {
    numberOfThreads = omp_get_num_threads();
    #pragma omp for reduction(+:l2_norm)
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
 }
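If nothing else needs to run once per thread inside the region, the same fix can also be written as a single combined construct. This is only a sketch based on the question's variables; it reads the team size with omp_get_max_threads() outside the region instead of inside it:

 numberOfThreads = omp_get_max_threads(); /* maximum team size the next parallel region may use */
 #pragma omp parallel for reduction(+:l2_norm)
 for (i = 0; i < n; i++)
 {
     double local_diff = x[i] - xseq[i];
     diff_vector[i] = local_diff;
     l2_norm += (local_diff * local_diff);
 }

Both forms distribute the iterations once over the team and combine the per-thread partial sums into l2_norm at the end of the loop.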

Now, having fixed those problems, you are still left with another (performance-wise) problem: the parallel loop is being executed in the context of a hybrid OpenMP + MPI parallelization, and you did not explicitly bind the OpenMP threads (within the MPI processes) to the corresponding cores. Without that explicit binding, one cannot be sure on which cores those threads will end up. More often than not, having multiple threads running on the same logical core will increase the overall execution time of the application being parallelized.

If your application uses threads, then you probably want to ensure that you are either not bound at all (by specifying --bind-to none), or bound to multiple cores using an appropriate binding level or a specific number of processing elements per application process. You can solve this problem by either:

  1. disabling the binding with the MPI flag --bind-to none, to enable threads to be assigned to different cores;
  2. or performing the thread binding accordingly. Check this SO thread on how to map threads to cores in hybrid parallelizations such as MPI + OpenMP.

By explicitly setting the number of threads per process accordingly, you can avoid having multiple threads end up on the same core and, consequently, avoid threads within the same core fighting for the same resources.
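As an illustration, assuming the MPI processes are launched with Open MPI's mpirun and the OpenMP runtime honours the standard OMP_PLACES / OMP_PROC_BIND variables, a hybrid run could be pinned roughly as in the sketch below; the rank count, thread count, and node layout are hypothetical and have to be adapted to your machine:

export OMP_NUM_THREADS=16      # threads per MPI rank (hypothetical node layout)
export OMP_PLACES=cores        # one OpenMP place per physical core
export OMP_PROC_BIND=close     # keep each rank's threads on neighbouring cores
mpirun -np 2 --map-by socket --bind-to socket ./rowmv 32000

# alternatively, disable MPI-level binding and let the OpenMP runtime place the threads
mpirun -np 2 --bind-to none ./rowmv 32000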

Advice:

IMO you should first test the performance of OpenMP alone, without any MPI processes. In this context, test the scalability of the code by measuring the sequential version against 2 threads, then 4, 8, and so on, gradually increasing the number of threads. Eventually, there will be a number of threads at which the code simply stops scaling. Naturally, the amount of parallel work performed by the threads has to be big enough to overcome the overhead of parallelism. Therefore, you should also test with bigger and bigger inputs.
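A minimal standalone benchmark for that kind of scalability test could look like the sketch below. It reuses the question's kernel; the default array size and the test data are arbitrary choices made here only for illustration:

#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    size_t n = (argc > 1) ? (size_t)atoll(argv[1]) : 32000;
    double *x = malloc(n * sizeof(double));
    double *xseq = malloc(n * sizeof(double));
    double *diff_vector = malloc(n * sizeof(double));
    if (!x || !xseq || !diff_vector)
        return 1;

    for (size_t i = 0; i < n; i++) {      /* arbitrary test data */
        x[i] = (double)i;
        xseq[i] = (double)i + 1e-9;
    }

    double l2_norm = 0.0;
    double t0 = omp_get_wtime();
    #pragma omp parallel for reduction(+:l2_norm)
    for (size_t i = 0; i < n; i++) {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += local_diff * local_diff;
    }
    double t1 = omp_get_wtime();

    printf("threads=%d time=%f norm=%.12e\n",
           omp_get_max_threads(), t1 - t0, sqrt(l2_norm));
    free(x); free(xseq); free(diff_vector);
    return 0;
}

Compiled with something like gcc -O2 -fopenmp and run with OMP_NUM_THREADS set to 1, 2, 4, and so on, it shows directly at which thread count this kernel stops scaling on a given machine.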

After having profiled, tested, and improved your OpenMP version, you can then extend that shared-memory parallelization to multiple processes using MPI.

Besides the race condition in updating a shared variable, as noted in @dreamcrash's answer, your code is not distributing the work properly.

#pragma omp parallel
{
    numberOfThreads = omp_get_num_threads();
    #pragma omp parallel for private(i)
                ~~~~~~~~
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
}

The parallel construct in the inner loop makes it a nested combined parallel for construct. It means that each thread in the team executing the outer parallel region spawns a brand new parallel region and distributes the i-loop over the threads in it. There is no distribution happening in the outer parallel region, so you end up with N threads all repeating the exact same work. By default nested parallelism is disabled, so the nested parallel region runs sequentially and your code is effectively doing this:

#pragma omp parallel
{
    numberOfThreads = omp_get_num_threads();
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
}

There is no distribution of work and all threads write to the same locations in the diff_vector[] array.
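A small self-contained sketch (not from the original post) makes this behaviour visible: with nested parallelism left at its default, disabled state, each thread of the outer team reaches the inner parallel for with a team of size 1 and therefore runs the whole iteration range itself. The thread and iteration counts below are chosen only for illustration:

#include <omp.h>
#include <stdio.h>

int main(void)
{
    #pragma omp parallel num_threads(4)      /* outer team of 4 threads */
    {
        int outer = omp_get_thread_num();
        #pragma omp parallel for             /* nested region: team of 1 thread by default */
        for (int i = 0; i < 3; i++)
            printf("outer thread %d runs iteration %d (inner team size %d)\n",
                   outer, i, omp_get_num_threads());
    }
    return 0;
}

Every iteration is printed once per outer thread, i.e. the work is duplicated four times instead of being divided, which is exactly what happens with the l2_norm loop in the question.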

On one hand, this code is in general memory-bound, since the amount of computation per byte of data is low - modern CPUs can do many multiplications and subtractions per cycle, while fetching data from memory and writing results back there takes many cycles. Memory-bound problems don't get any faster with more threads, since the limiting factor is the memory bandwidth. This isn't that big of a problem in your case, because 32K array entries take up 256 KB of memory and fit in most CPU caches, and the L3 cache is blazing fast, but it is still larger than the fastest L1 cache of a single CPU core. On the other hand, writing to the same memory areas from multiple threads results in true and false sharing, with the associated inter-thread cache invalidation, which usually results in the parallel code running way slower than the sequential version.

There are tools that can help you analyse the performance of your code and spot problems. As I already wrote in a comment, Intel VTune is one of them and is freely available as part of the oneAPI toolkit. Intel Inspector is another one (again, free and part of the oneAPI toolkit), and it finds problems such as data races. The two tools work very well together, and I couldn't recommend them strongly enough to any aspiring parallel programmer.

There is also a minor race condition writing to numberOfThreads, but since all values written are the same, it isn't much of a logical problem. The correct version of the code in question should be:

#pragma omp parallel
{
    #pragma omp master
    numberOfThreads = omp_get_num_threads();

    #pragma omp for reduction(+:l2_norm)
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
}
