OpenACC - 复杂循环对 a->,c->,b-> 的依赖阻止了并行化

Question

我正在使用 OpenACC（PGI 社区版 19.10，Windows 平台），比较矩阵乘法在 CPU 上并行化与未并行化时的执行时间。我使用的代码如下：

#include <time.h>
#include <stdlib.h>

// Benchmark driver for the question: builds three SIZE_XY x SIZE_XY int
// matrices as tables of separately allocated rows, fills a and b with
// pseudo-random values, accumulates the product a x b into c, and prints the
// first three entries of c together with the processor time reported by clock().
// NOTE(review): printf is called below, but the includes shown with this
// listing are only <time.h> and <stdlib.h>.
int main()
{
    // Fixed seed for the rand() calls in the fill loop below
    srand(42);

    // Matrix dimension, plus the offset (MIN_VAL) and modulus (MAX_VAL) applied
    // to rand() when generating entries
    const int SIZE_XY = 1000;
    const int MIN_VAL = 5000;   
    const int MAX_VAL = 7000000;

    int i, j, k; // loop indices, shared by every loop nest below

    double time_spent = 0.0;
    clock_t begin = clock(); // timed span starts here, so it includes allocation and fill

    // a, b and c are tables of SIZE_XY row pointers; every row is its own
    // malloc of SIZE_XY ints. The malloc results are not checked and the rows
    // are never freed.
    int *a[SIZE_XY];
    int *b[SIZE_XY];
    int *c[SIZE_XY];
    for (i = 0; i < SIZE_XY; i++)
    {
        a[i] = (int *)malloc(SIZE_XY * sizeof(int));
        b[i] = (int *)malloc(SIZE_XY * sizeof(int));
        c[i] = (int *)malloc(SIZE_XY * sizeof(int));
    }

    // OpenACC kernels region around the fill loops: a and b get
    // (rand() % MAX_VAL) + MIN_VAL, c is zeroed.
    // NOTE(review): rand() is called inside the region; the C standard does not
    // require rand() to be free of data races - confirm this is intended.
    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = (rand() % MAX_VAL) + MIN_VAL;
                b[i][j] = (rand() % MAX_VAL) + MIN_VAL;
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n");
    printf("Array B allocated and filled with random numbers ...\n");
    printf("Array C initialized ...\n");

    // Matrix product: c[i][j] accumulates a[i][k] * b[k][j] over k.
    // a, b and c are plain int* rows with no restrict qualifier or loop clause,
    // so nothing here tells the compiler that they do not alias.
    // NOTE(review): every entry is at least MIN_VAL (5000), so a sum of SIZE_XY
    // products exceeds the range of a 32-bit int - assumes int is 32 bits.
    #pragma acc kernels // HERE: the loop nest the question asks about
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                for (k = 0; k < SIZE_XY; k++)
                {
                    // Read-modify-write of the same c[i][j] on every k iteration
                    c[i][j] = c[i][j] + a[i][k] * b[k][j];
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    // clock() reports processor time used by the program, not wall-clock time
    clock_t end = clock();

    time_spent += (double)(end - begin) / CLOCKS_PER_SEC;

    printf("Time elpased is %f seconds", time_spent);
}

当我在 PGI CMD 中运行以下命令时： pgcc -acc -ta=multicore -Minfo=all,accel matrixACC.c ，我收到以下信息：

59, Complex loop carried dependence of a->,c->,b-> prevents parallelization
62, Complex loop carried dependence of a->,c->,b-> prevents parallelization
64, Complex loop carried dependence of a->,c->,b-> prevents parallelization
    Loop carried dependence due to exposed use of c[i1][i2] prevents parallelization

能否请大家帮我弄清楚为什么会出现这种情况，以及应该如何并行化计算矩阵乘法的循环？

谢谢

Answer 1

编译器无法确定您的 3 个指针变量（ a 、 b 、 c ）是否会相互别名。 如果它们以某种方式相互别名，则无法确定计算任何特定c[i][j]的独立性，并且将无法正确并行化（任何）循环。

解决此问题的一种可能方法是告知编译器：您作为程序员保证（例如）第一个循环的各次迭代彼此独立。 您可以通过将#pragma acc loop independent放置在第一个for循环语句之前来做到这一点。 对于您在此处选择的矩阵大小（和多核目标），这将为您提供大量暴露的并行性。 （编译器仍会针对其他循环发出未并行化的Minfo消息，但这很可能没有问题。对于多核目标，拥有 1000 个并行工作项应该足以获得良好的性能）。

请注意，按您选择的初始化范围，计算结果很容易超出int的表示范围（整数溢出）。 您将得到毫无意义的结果。

以下代码有可能解决上述问题：

$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
int main() // Times an all-ones SIZE_XY x SIZE_XY int matrix product whose outer multiply loop is marked independent
{
    // Fixed seed; rand() itself is no longer called in this version
    srand(42);

    // Matrix dimension; MIN_VAL and MAX_VAL remain declared but are no longer referenced
    const int SIZE_XY = 1000;
    const int MIN_VAL = 5000;
    const int MAX_VAL = 7000000;

    int i, j, k; // loop indices, shared by every loop nest below

    double time_spent = 0.0;
    clock_t begin = clock();

    // a, b and c are tables of SIZE_XY restrict-qualified row pointers; every row is
    // its own malloc of SIZE_XY ints (results are not checked, rows are never freed)
    int * restrict a[SIZE_XY];
    int * restrict b[SIZE_XY];
    int * restrict c[SIZE_XY];
    for (i = 0; i < SIZE_XY; i++)
    {
        a[i] = (int *)malloc(SIZE_XY * sizeof(int));
        b[i] = (int *)malloc(SIZE_XY * sizeof(int));
        c[i] = (int *)malloc(SIZE_XY * sizeof(int));
    }

    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = 1; // constant fill in place of (rand() % MAX_VAL) + MIN_VAL
                b[i][j] = 1; // constant fill in place of (rand() % MAX_VAL) + MIN_VAL
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n");
    printf("Array B allocated and filled with random numbers ...\n");
    printf("Array C initialized ...\n");

    // Matrix product: c[i][j] accumulates a[i][k] * b[k][j] over k
    #pragma acc kernels //HERE
    {
        #pragma acc loop independent // programmer's assertion that iterations of the i loop are independent
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                for (k = 0; k < SIZE_XY; k++)
                {
                    c[i][j] = c[i][j] + a[i][k] * b[k][j]; // same c[i][j] is read and written on every k iteration
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    clock_t end = clock(); // clock() reports processor time used by the program, not wall-clock time

    time_spent += (double)(end - begin) / CLOCKS_PER_SEC;

    printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
"t1.c", line 21: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict a[SIZE_XY];
                       ^

"t1.c", line 22: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict b[SIZE_XY];
                       ^

"t1.c", line 23: warning: use of a const variable in a constant expression is
          nonstandard in C
      int * restrict c[SIZE_XY];
                       ^

"t1.c", line 11: warning: variable "MIN_VAL" was declared but never referenced
      const int MIN_VAL = 5000;
                ^

"t1.c", line 12: warning: variable "MAX_VAL" was declared but never referenced
      const int MAX_VAL = 7000000;
                ^

main:
     33, Loop is parallelizable
         Generating Multicore code
         33, #pragma acc loop gang
     35, Loop is parallelizable
     52, Loop is parallelizable
         Generating Multicore code
         52, #pragma acc loop gang
     54, Complex loop carried dependence of a->,c->,b-> prevents parallelization
     56, Complex loop carried dependence of a->,c->,b-> prevents parallelization
         Loop carried dependence of c-> prevents parallelization
         Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 9.010000 seconds
real    0m9.079s
user    0m9.019s
sys     0m0.061s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 20.140000 seconds
real    0m0.563s
user    0m20.053s
sys     0m0.132s
$

在我的机器上，使用 gcc 编译的代码大约需要 9 秒，而使用 PGI OpenACC 编译器编译的代码大约需要 0.5 秒。 （程序自身打印的约 20 秒来自 clock()，它统计的是所有线程累计的 CPU 时间，与 time 输出中的 user 一项相符；实际耗时应以 real 一项为准。）

顺便说一句，我个人通常会避免您选择的数组分配方法，因为不能保证各种malloc操作会导致相邻/连续分配。 然而，对于multicore目标，代码可以正常工作。

为了解决这个问题，我建议对您的代码进行一些更改，如下所示：

$ cat t1.c
#include <time.h>
#include <stdlib.h>
#include <stdio.h>

typedef int mt; // matrix element type
#define SIZE_XY 1000 // matrix dimension (rows and columns)
typedef mt mat[SIZE_XY]; // one matrix row: SIZE_XY elements of type mt

int main() // Times an all-ones SIZE_XY x SIZE_XY matrix product stored in three single-malloc buffers
{
    // Fixed seed; rand() itself is not called in this version
    srand(42);

    // SIZE_XY is now a file-level macro, so no local constants are declared here

    int i, j, k; // loop indices, shared by every loop nest below

    double time_spent = 0.0;
    clock_t begin = clock();

    // a, b and c are restrict-qualified pointers to rows (mat); each matrix is a single
    // malloc of SIZE_XY*SIZE_XY elements (results are not checked, buffers are never freed)
    mat * restrict a;
    mat * restrict b;
    mat * restrict c;
    a = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
    b = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));
    c = (mat *)malloc(SIZE_XY*SIZE_XY * sizeof(mt));

    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                a[i][j] = 1; // constant fill in place of (rand() % MAX_VAL) + MIN_VAL
                b[i][j] = 1; // constant fill in place of (rand() % MAX_VAL) + MIN_VAL
                c[i][j] = 0;
            }
        }
    }

    printf("Array A allocated and filled with random numbers ...\n");
    printf("Array B allocated and filled with random numbers ...\n");
    printf("Array C initialized ...\n");

    // Matrix product: c[i][j] accumulates a[i][k] * b[k][j] over k (no loop clause in this version)
    #pragma acc kernels
    {
        for (i = 0; i < SIZE_XY; i++)
        {
            for (j = 0; j < SIZE_XY; j++)
            {
                for (k = 0; k < SIZE_XY; k++)
                {
                    c[i][j] = c[i][j] + a[i][k] * b[k][j]; // same c[i][j] is read and written on every k iteration
                }
            }
        }
    }

    printf("Matrices multiplied ...\n");
    printf("The first three values of A x B are %d, %d, %d\n", c[0][0], c[0][1], c[0][2]);

    clock_t end = clock(); // clock() reports processor time used by the program, not wall-clock time

    time_spent += (double)(end - begin) / CLOCKS_PER_SEC;

    printf("Time elpased is %f seconds", time_spent);
}
$ gcc -o t1 t1.c -std=c99 -O3
$ pgcc -acc -ta=multicore -Minfo=all,accel t1.c -o t1p
main:
     32, Loop is parallelizable
         Generating Multicore code
         32, #pragma acc loop gang
     34, Loop is parallelizable
     51, Loop is parallelizable
         Generating Multicore code
         51, #pragma acc loop gang
     53, Loop is parallelizable
     55, Complex loop carried dependence of c-> prevents parallelization
         Loop carried dependence of c-> prevents parallelization
         Loop carried backward dependence of c-> prevents vectorization
$ time ./t1
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 0.650000 seconds
real    0m0.708s
user    0m0.663s
sys     0m0.047s
$ time ./t1p
Array A allocated and filled with random numbers ...
Array B allocated and filled with random numbers ...
Array C initialized ...
Matrices multiplied ...
The first three values of A x B are 1000, 1000, 1000
Time elpased is 17.510000 seconds
real    0m0.499s
user    0m17.466s
sys     0m0.093s
$

（gcc 4.8.5，pgcc 20.5-0，Xeon E5-2690 v2，总共 40 个内核）

有几个优点：

我们可以使用 C99 关键字restrict将我们的意图传达给编译器，而无需使用额外的编译指示
这将是a 、 b和c的连续分配，如果您决定从multicore切换到加速器目标，则行为会更简单
OpenACC 编译器现在可以在没有额外帮助的情况下处理前两个循环嵌套。
GNU 编译器同样能从这种向编译器传达意图的写法中受益。 在我的机器上，“普通”GNU 编译器 ( gcc ) 生成的代码运行速度几乎与 OpenACC 代码一样快。 （~0.7s 对 ~0.5s）

OpenACC - 复杂循环对 a->,c->,b-> 的依赖阻止了并行化

问题描述

1 个解决方案

解决方案1
0 已采纳 2020-07-25 05:26:06

OpenACC - 复杂循环对 a->,c->,b-> 的依赖阻止了并行化

问题描述

1 个解决方案

解决方案1 0 已采纳 2020-07-25 05:26:06

OpenACC - 复杂循环对 a->,c->,b-> 的依赖阻止了并行化

解决方案1
0 已采纳 2020-07-25 05:26:06